diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b290e090..97c8c97f 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,20 +1,20 @@ { "name": "nfcore", - "image": "nfcore/gitpod:latest", - "remoteUser": "gitpod", - "runArgs": ["--privileged"], + "image": "nfcore/devcontainer:latest", - // Configure tool-specific properties. - "customizations": { - // Configure properties specific to VS Code. - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "python.defaultInterpreterPath": "/opt/conda/bin/python" - }, + "remoteUser": "root", + "privileged": true, - // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } + "remoteEnv": { + // Workspace path on the host for mounting with docker-outside-of-docker + "LOCAL_WORKSPACE_FOLDER": "${localWorkspaceFolder}" + }, + + "onCreateCommand": "./.devcontainer/setup.sh", + + "hostRequirements": { + "cpus": 4, + "memory": "16gb", + "storage": "32gb" } } diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh new file mode 100755 index 00000000..f9b8e3f2 --- /dev/null +++ b/.devcontainer/setup.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# Customise the terminal command prompt +echo "export PROMPT_DIRTRIM=2" >> $HOME/.bashrc +echo "export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] '" >> $HOME/.bashrc +export PROMPT_DIRTRIM=2 +export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] ' + +# Update Nextflow +nextflow self-update + +# Update welcome message +echo "Welcome to the nf-core/stableexpression devcontainer!" 
> /usr/local/etc/vscode-dev-containers/first-run-notice.txt diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 72dda289..00000000 --- a/.editorconfig +++ /dev/null @@ -1,33 +0,0 @@ -root = true - -[*] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -indent_style = space - -[*.{md,yml,yaml,html,css,scss,js}] -indent_size = 2 - -# These files are edited and tested upstream in nf-core/modules -[/modules/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset -[/subworkflows/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset - -[/assets/email*] -indent_size = unset - -# ignore python and markdown -[*.{py,md}] -indent_style = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index a4e527f0..bdd34869 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,4 +1,4 @@ -# nf-core/stableexpression: Contributing Guidelines +# `nf-core/stableexpression`: Contributing Guidelines Hi there! Many thanks for taking an interest in improving nf-core/stableexpression. @@ -55,9 +55,9 @@ These tests are run both with the latest available version of `Nextflow` and als :warning: Only in the unlikely and regretful event of a release happening with a bug. -- On your own fork, make a new branch `patch` based on `upstream/master`. +- On your own fork, make a new branch `patch` based on `upstream/main` or `upstream/master`. - Fix the bug, and bump version (X.Y.Z+1). -- A PR should be made on `master` from patch to directly this particular bug. +- Open a pull-request from `patch` to `main`/`master` with the changes. 
## Getting help @@ -65,26 +65,26 @@ For further information/help, please consult the [nf-core/stableexpression docum ## Pipeline contribution conventions -To make the nf-core/stableexpression code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. +To make the `nf-core/stableexpression` code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. ### Adding a new step If you wish to contribute a new step, please use the following coding standards: -1. Define the corresponding input channel into your new process from the expected previous process channel +1. Define the corresponding input channel into your new process from the expected previous process channel. 2. Write the process block (see below). 3. Define the output channel if needed (see below). 4. Add any new parameters to `nextflow.config` with a default (see below). 5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core pipelines schema build` tool). 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. -8. If applicable, add a new test command in `.github/workflow/ci.yml`. +8. If applicable, add a new test in the `tests` directory. 9. Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. 10. Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. ### Default values -Parameters should be initialised / defined with default values in `nextflow.config` under the `params` scope. 
+Parameters should be initialised / defined with default values within the `params` scope in `nextflow.config`. Once there, use `nf-core pipelines schema build` to add to `nextflow_schema.json`. diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 8517cecf..17ba21c1 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -9,7 +9,6 @@ body: - [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) - [nf-core/stableexpression pipeline documentation](https://nf-co.re/stableexpression/usage) - - type: textarea id: description attributes: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 07b5000a..2c2997d7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -8,14 +8,14 @@ These are the most common things requested on pull requests (PRs). Remember that PRs should be made against the dev branch, unless you're preparing a pipeline release. -Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/stableexpression/tree/master/.github/CONTRIBUTING.md) +Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/stableexpression/tree/main/.github/CONTRIBUTING.md) --> ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/stableexpression/tree/master/.github/CONTRIBUTING.md) +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/stableexpression/tree/main/.github/CONTRIBUTING.md) - [ ] If necessary, also make a PR on the nf-core/stableexpression _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. 
- [ ] Make sure your code lints (`nf-core pipelines lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). diff --git a/.github/actions/get-shards/action.yml b/.github/actions/get-shards/action.yml new file mode 100644 index 00000000..34085279 --- /dev/null +++ b/.github/actions/get-shards/action.yml @@ -0,0 +1,69 @@ +name: "Get number of shards" +description: "Get the number of nf-test shards for the current CI job" +inputs: + max_shards: + description: "Maximum number of shards allowed" + required: true + paths: + description: "Component paths to test" + required: false + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +outputs: + shard: + description: "Array of shard numbers" + value: ${{ steps.shards.outputs.shard }} + total_shards: + description: "Total number of shards" + value: ${{ steps.shards.outputs.total_shards }} +runs: + using: "composite" + steps: + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} + - name: Get number of shards + id: shards + shell: bash + run: | + # Run nf-test with dynamic parameter + nftest_output=$(nf-test test \ + --profile +docker \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --dry-run \ + --ci \ + --changed-since HEAD^) || { + echo "nf-test command failed with exit code $?" + echo "Full output: $nftest_output" + exit 1 + } + echo "nf-test dry-run output: $nftest_output" + + # Default values for shard and total_shards + shard="[]" + total_shards=0 + + # Check if there are related tests + if echo "$nftest_output" | grep -q 'No tests to execute'; then + echo "No related tests found." + else + # Extract the number of related tests + number_of_shards=$(echo "$nftest_output" | sed -n 's|.*Executed \([0-9]*\) tests.*|\1|p') + if [[ -n "$number_of_shards" && "$number_of_shards" -gt 0 ]]; then + shards_to_run=$(( $number_of_shards < ${{ inputs.max_shards }} ? 
$number_of_shards : ${{ inputs.max_shards }} )) + shard=$(seq 1 "$shards_to_run" | jq -R . | jq -c -s .) + total_shards="$shards_to_run" + else + echo "Unexpected output format. Falling back to default values." + fi + fi + + # Write to GitHub Actions outputs + echo "shard=$shard" >> $GITHUB_OUTPUT + echo "total_shards=$total_shards" >> $GITHUB_OUTPUT + + # Debugging output + echo "Final shard array: $shard" + echo "Total number of shards: $total_shards" diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml new file mode 100644 index 00000000..41ec7fc3 --- /dev/null +++ b/.github/actions/nf-test/action.yml @@ -0,0 +1,111 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile to use" + required: true + shard: + description: "Shard number for this CI job" + required: true + total_shards: + description: "Total number of test shards(NOT the total number of matrix jobs)" + required: true + paths: + description: "Test paths" + required: true + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +runs: + using: "composite" + steps: + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 + with: + python-version: "3.14" + + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + install-pdiff: true + + - name: Setup apptainer + if: contains(inputs.profile, 'singularity') + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + 
auto-update-conda: true + conda-solver: libmamba + channels: conda-forge + channel-priority: strict + conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile=+${{ inputs.profile }} \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --ci \ + --changed-since HEAD^ \ + --verbose \ + --tap=test.tap \ + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} --debug + + # Save the absolute path of the test.tap file to the output + echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT + + - name: Generate test summary + if: always() + shell: bash + run: | + # Add header if it doesn't exist (using a token file to track this) + if [ ! -f ".summary_header" ]; then + echo "# 🚀 nf-test results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Status | Test Name | Profile | Shard |" >> $GITHUB_STEP_SUMMARY + echo "|:------:|-----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + touch .summary_header + fi + + if [ -f test.tap ]; then + while IFS= read -r line; do + if [[ $line =~ ^ok ]]; then + test_name="${line#ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ✅ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + elif [[ $line =~ ^not\ ok ]]; then + test_name="${line#not ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ❌ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + done < test.tap + else + echo "| ⚠️ | No test results found | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + + - name: Clean up + if: always() + shell: bash + run: | + sudo rm -rf /home/ubuntu/tests/ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml 
index ed1ada9d..a5f81c34 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -1,36 +1,48 @@ name: nf-core AWS full size tests -# This workflow is triggered on published releases. +# This workflow is triggered when a review is submitted on a PR (it only runs for approved PRs against the main/master branch) and on published releases. # It can be additionally triggered manually with GitHub actions workflow dispatch button. # It runs the -profile 'test_full' on AWS batch on: + workflow_dispatch: + pull_request_review: + types: [submitted] release: types: [published] - workflow_dispatch: + jobs: run-platform: name: Run AWS full tests - if: github.repository == 'nf-core/stableexpression' + # run only if the submitted review approves a PR against the master/main branch, or if triggered manually or by a published release + if: github.repository == 'nf-core/stableexpression' && github.event.review.state == 'approved' && (github.event.pull_request.base.ref == 'master' || github.event.pull_request.base.ref == 'main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release' runs-on: ubuntu-latest steps: + - name: Set revision variable + id: revision + run: | + echo "revision=${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'release') && github.sha || 'dev' }}" >> "$GITHUB_OUTPUT" + - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 + # TODO nf-core: You can customise AWS full pipeline tests as required + # Add full size test data (but still relatively small datasets for few samples) + # on the `test_full.config` test runs with only one set of parameters with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/stableexpression/work-${{ github.sha }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} + revision: ${{ steps.revision.outputs.revision }} 
+ workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/stableexpression/work-${{ steps.revision.outputs.revision }} parameters: | { "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/stableexpression/results-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/stableexpression/results-${{ steps.revision.outputs.revision }}" } profiles: test_full - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 76e01b25..429570cd 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -14,20 +14,20 @@ jobs: - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/stableexpression/work-${{ github.sha }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/stableexpression/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/stableexpression/results-test-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/stableexpression/results-test-${{ github.sha }}" } profiles: test - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 
3399b638..db9f5352 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -1,15 +1,17 @@ name: nf-core branch protection -# This workflow is triggered on PRs to master branch on the repository -# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` +# This workflow is triggered on PRs to `main`/`master` branch on the repository +# It fails when someone tries to make a PR against the nf-core `main`/`master` branch instead of `dev` on: pull_request_target: - branches: [master] + branches: + - main + - master jobs: test: runs-on: ubuntu-latest steps: - # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches + # PRs to the nf-core repo main/master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs if: github.repository == 'nf-core/stableexpression' run: | @@ -22,7 +24,7 @@ jobs: uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2 with: message: | - ## This PR is against the `master` branch :x: + ## This PR is against the `${{github.event.pull_request.base.ref}}` branch :x: * Do not close this PR * Click _Edit_ and change the `base` to `dev` @@ -32,9 +34,9 @@ jobs: Hi @${{ github.event.pull_request.user.login }}, - It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. - The `master` branch on nf-core repositories should always contain code from the latest release. - Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. 
+ It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) ${{github.event.pull_request.base.ref}} branch. + The ${{github.event.pull_request.base.ref}} branch on nf-core repositories should always contain code from the latest release. + Because of this, PRs to ${{github.event.pull_request.base.ref}} are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. Note that even after this, the test will continue to show as failing until you push a new commit. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 7965be2d..00000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,73 +0,0 @@ -name: nf-core CI -# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - push: - branches: - - dev - pull_request: - release: - types: [published] - -env: - NXF_ANSI_LOG: false - NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity - NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity - -concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" - cancel-in-progress: true - -jobs: - test: - name: Run pipeline with test data - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/stableexpression') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "23.04.0" - - "latest-everything" - profile: - - "docker" - - "singularity" - shard: [1, 2, 3, 4] - steps: - - name: Check out pipeline code - uses: 
actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - - name: Set up JDK 17 - uses: actions/setup-java@8df1039502a15bceb9433410b1a100fbe190c53b # v4 - with: - distribution: "temurin" - java-version: "17" - - - name: Install Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - name: Set up Apptainer - if: matrix.profile == 'singularity' - uses: eWaterCycle/setup-apptainer@main - - - name: Set up Singularity - if: matrix.profile == 'singularity' - run: | - mkdir -p $NXF_SINGULARITY_CACHEDIR - mkdir -p $NXF_SINGULARITY_LIBRARYDIR - - - name: Disk space cleanup - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: Install nf-test - run: | - wget -qO- https://get.nf-test.com | bash - sudo mv nf-test /usr/local/bin/ - - - name: Run Tests (Shard ${{ matrix.shard }}/${{ strategy.job-total }}) - run: nf-test test --ci --shard ${{ matrix.shard }}/${{ strategy.job-total }} - - - name: Run whole pipeline with test profile - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 0b6b1f27..6adb0fff 100644 --- a/.github/workflows/clean-up.yml +++ b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9 + - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." 
diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 2d20d644..45884ff9 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -1,33 +1,42 @@ -name: Test successful pipeline download with 'nf-core download' +name: Test successful pipeline download with 'nf-core pipelines download' # Run the workflow when: # - dispatched manually -# - when a PR is opened or reopened to master branch +# - when a PR is opened or reopened to main/master branch # - the head branch of the pull request is updated, i.e. if fixes for a release are pushed last minute to dev. on: workflow_dispatch: inputs: testbranch: - description: "The specific branch you wish to utilize for the test execution of nf-core download." + description: "The specific branch you wish to utilize for the test execution of nf-core pipelines download." required: true default: "dev" pull_request: - types: - - opened - - edited - - synchronize - branches: - - master - pull_request_target: branches: + - main - master env: NXF_ANSI_LOG: false jobs: + configure: + runs-on: ubuntu-latest + outputs: + REPO_LOWERCASE: ${{ steps.get_repo_properties.outputs.REPO_LOWERCASE }} + REPOTITLE_LOWERCASE: ${{ steps.get_repo_properties.outputs.REPOTITLE_LOWERCASE }} + REPO_BRANCH: ${{ steps.get_repo_properties.outputs.REPO_BRANCH }} + steps: + - name: Get the repository name and current branch + id: get_repo_properties + run: | + echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> "$GITHUB_OUTPUT" + echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> "$GITHUB_OUTPUT" + echo "REPO_BRANCH=${{ github.event.inputs.testbranch || 'dev' }}" >> "$GITHUB_OUTPUT" + download: runs-on: ubuntu-latest + needs: configure steps: - name: Install Nextflow uses: nf-core/setup-nextflow@v2 @@ -35,52 +44,91 @@ jobs: - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - uses: 
actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - - uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 + + - name: Setup Apptainer + uses: eWaterCycle/setup-apptainer@4bb22c52d4f63406c49e94c804632975787312b3 # v2.0.0 with: - singularity-version: 3.8.3 + apptainer-version: 1.3.4 - name: Install dependencies run: | python -m pip install --upgrade pip - pip install git+https://github.com/nf-core/tools.git@dev + pip install git+https://github.com/nf-core/tools.git - - name: Get the repository name and current branch set as environment variable + - name: Make a cache directory for the container images run: | - echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} - echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} - echo "REPO_BRANCH=${{ github.event.inputs.testbranch || 'dev' }}" >> ${GITHUB_ENV} + mkdir -p ./singularity_container_images - name: Download the pipeline env: - NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_CACHEDIR: ./singularity_container_images run: | - nf-core download ${{ env.REPO_LOWERCASE }} \ - --revision ${{ env.REPO_BRANCH }} \ - --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ + nf-core pipelines download ${{ needs.configure.outputs.REPO_LOWERCASE }} \ + --revision ${{ needs.configure.outputs.REPO_BRANCH }} \ + --outdir ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }} \ --compress "none" \ --container-system 'singularity' \ - --container-library "quay.io" -l "docker.io" -l "ghcr.io" \ + --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io/library/" \ --container-cache-utilisation 'amend' \ - --download-configuration + --download-configuration 'yes' - name: Inspect download - run: tree ./${{ env.REPOTITLE_LOWERCASE }} + run: tree ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }} + + - 
name: Inspect container images + run: tree ./singularity_container_images | tee ./container_initial + + - name: Count the downloaded number of container images + id: count_initial + run: | + image_count=$(ls -1 ./singularity_container_images | wc -l | xargs) + echo "Initial container image count: $image_count" + echo "IMAGE_COUNT_INITIAL=$image_count" >> "$GITHUB_OUTPUT" - name: Run the downloaded pipeline (stub) id: stub_run_pipeline continue-on-error: true env: - NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_CACHEDIR: ./singularity_container_images NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results + run: nextflow run ./${{needs.configure.outputs.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ needs.configure.outputs.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results - name: Run the downloaded pipeline (stub run not supported) id: run_pipeline - if: ${{ job.steps.stub_run_pipeline.status == failure() }} + if: ${{ steps.stub_run_pipeline.outcome == 'failure' }} env: - NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_CACHEDIR: ./singularity_container_images NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -profile test,singularity --outdir ./results + run: nextflow run ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ needs.configure.outputs.REPO_BRANCH }}) -profile test,singularity --outdir ./results + + - name: Count the downloaded number of container images + id: count_afterwards + run: | + image_count=$(ls -1 ./singularity_container_images | wc -l | xargs) + echo "Post-pipeline run container image count: $image_count" + echo "IMAGE_COUNT_AFTER=$image_count" >> "$GITHUB_OUTPUT" + + - name: Compare container image counts + id: count_comparison + run: | + if [ "${{ 
steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}" -ne "${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}" ]; then + initial_count=${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }} + final_count=${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }} + difference=$((final_count - initial_count)) + echo "$difference additional container images were \n downloaded at runtime . The pipeline has no support for offline runs!" + tree ./singularity_container_images > ./container_afterwards + diff ./container_initial ./container_afterwards + exit 1 + else + echo "The pipeline can be downloaded successfully!" + fi + + - name: Upload Nextflow logfile for debugging purposes + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 + with: + name: nextflow_logfile.txt + path: .nextflow.log* + include-hidden-files: true diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix_linting.yml similarity index 80% rename from .github/workflows/fix-linting.yml rename to .github/workflows/fix_linting.yml index 207a1b24..6df255c4 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix_linting.yml @@ -13,13 +13,13 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 with: token: ${{ secrets.nf_core_bot_auth_token }} # indication that the linting is being fixed - name: React on comment - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: eyes @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: 
actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -47,7 +47,7 @@ jobs: # indication that the linting has finished - name: react if linting finished succesfully if: steps.pre-commit.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: "+1" @@ -67,21 +67,21 @@ jobs: - name: react if linting errors were fixed id: react-if-fixed if: steps.commit-and-push.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: hooray - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: confused - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: issue-number: ${{ github.event.issue.number }} body: | diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index a502573c..7a527a34 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -3,9 +3,6 @@ name: nf-core linting # It runs the `nf-core pipelines lint` and markdown lint tests to ensure # that the code meets the nf-core guidelines. 
on: - push: - branches: - - dev pull_request: release: types: [published] @@ -14,12 +11,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - - name: Set up Python 3.12 - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - name: Set up Python 3.14 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -31,18 +28,18 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - name: read .nf-core.yml - uses: pietrobolcato/action-read-yaml@1.1.0 + uses: pietrobolcato/action-read-yaml@9f13718d61111b69f30ab4ac683e67a56d254e1d # 1.1.0 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml @@ -74,7 +71,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 42e519bf..e6e9bc26 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # 
v6 + uses: dawidd6/action-download-artifact@ac66b43f0e6a346234dd65d4d0c8fbb31cb316e5 # v11 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml new file mode 100644 index 00000000..1668cf73 --- /dev/null +++ b/.github/workflows/nf-test.yml @@ -0,0 +1,144 @@ +name: Run nf-test +on: + pull_request: + paths-ignore: + - "docs/**" + - "**/meta.yml" + - "**/*.md" + - "**/*.png" + - "**/*.svg" + release: + types: [published] + workflow_dispatch: + +# Cancel if a newer run is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NFT_VER: "0.9.3" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test-changes: + name: nf-test-changes + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test-changes + - runner=4cpu-linux-x64 + outputs: + shard: ${{ steps.set-shards.outputs.shard }} + total_shards: ${{ steps.set-shards.outputs.total_shards }} + steps: + - name: Clean Workspace # Purge the workspace in case it's running on a self-hosted runner + run: | + ls -la ./ + rm -rf ./* || true + rm -rf ./.??* || true + ls -la ./ + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: get number of shards + id: set-shards + uses: ./.github/actions/get-shards + env: + 
NFT_VER: ${{ env.NFT_VER }} + with: + max_shards: 10 + + - name: debug + run: | + echo ${{ steps.set-shards.outputs.shard }} + echo ${{ steps.set-shards.outputs.total_shards }} + + nf-test: + name: "${{ matrix.profile }} | ${{ matrix.NXF_VER }} | ${{ matrix.shard }}/${{ needs.nf-test-changes.outputs.total_shards }}" + needs: [nf-test-changes] + if: ${{ needs.nf-test-changes.outputs.total_shards != '0' }} + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test + - runner=4cpu-linux-x64 + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} + profile: [conda, docker, singularity] + isMain: + - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} + # Exclude conda and singularity on dev + exclude: + - isMain: false + profile: "conda" + - isMain: false + profile: "singularity" + NXF_VER: + - "25.04.0" + - "latest-everything" + env: + NXF_ANSI_LOG: false + TOTAL_SHARDS: ${{ needs.nf-test-changes.outputs.total_shards }} + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: Run nf-test + id: run_nf_test + uses: ./.github/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + NXF_VERSION: ${{ matrix.NXF_VER }} + with: + profile: ${{ matrix.profile }} + shard: ${{ matrix.shard }} + total_shards: ${{ env.TOTAL_SHARDS }} + + - name: Report test status + if: ${{ always() }} + run: | + if [[ "${{ steps.run_nf_test.outcome }}" == "failure" ]]; then + echo "::error::Test with ${{ matrix.NXF_VER }} failed" + # Add to workflow summary + echo "## ❌ Test failed: ${{ matrix.profile }} | ${{ matrix.NXF_VER }} | Shard ${{ matrix.shard }}/${{ env.TOTAL_SHARDS }}" >> $GITHUB_STEP_SUMMARY + if [[ "${{ matrix.NXF_VER }}" == "latest-everything" ]]; then + echo "::warning::Test with latest-everything failed but will not cause workflow failure. 
Please check if the error is expected or if it needs fixing." + fi + if [[ "${{ matrix.NXF_VER }}" != "latest-everything" ]]; then + exit 1 + fi + fi + + confirm-pass: + needs: [nf-test] + if: always() + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-confirm-pass + - runner=2cpu-linux-x64 + steps: + - name: One or more tests failed (excluding latest-everything) + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: One or more tests cancelled + if: ${{ contains(needs.*.result, 'cancelled') }} + run: exit 1 + + - name: All tests ok + if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - name: debug-print + if: always() + run: | + echo "::group::DEBUG: `needs` Contents" + echo "DEBUG: toJSON(needs) = ${{ toJSON(needs) }}" + echo "DEBUG: toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" + echo "::endgroup::" diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml index 03ecfcf7..431d3d44 100644 --- a/.github/workflows/release-announcements.yml +++ b/.github/workflows/release-announcements.yml @@ -12,8 +12,12 @@ jobs: - name: get topics and convert to hashtags id: get_topics run: | - echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" >> $GITHUB_OUTPUT + echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" | sed 's/-//g' >> $GITHUB_OUTPUT + - name: get description + id: get_description + run: | + echo "description=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .description')" >> $GITHUB_OUTPUT - uses: rzr/fediverse-action@master with: access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} @@ -22,48 +26,15 @@ jobs: # 
https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release message: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! - + ${{ steps.get_description.outputs.description }} Please see the changelog: ${{ github.event.release.html_url }} ${{ steps.get_topics.outputs.topics }} #nfcore #openscience #nextflow #bioinformatics - send-tweet: - runs-on: ubuntu-latest - - steps: - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 - with: - python-version: "3.10" - - name: Install dependencies - run: pip install tweepy==4.14.0 - - name: Send tweet - shell: python - run: | - import os - import tweepy - - client = tweepy.Client( - access_token=os.getenv("TWITTER_ACCESS_TOKEN"), - access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), - consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), - consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), - ) - tweet = os.getenv("TWEET") - client.create_tweet(text=tweet) - env: - TWEET: | - Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! - - Please see the changelog: ${{ github.event.release.html_url }} - TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} - TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} - TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} - TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} - bsky-post: runs-on: ubuntu-latest steps: - - uses: zentered/bluesky-post-action@80dbe0a7697de18c15ad22f4619919ceb5ccf597 # v0.1.0 + - uses: zentered/bluesky-post-action@6461056ea355ea43b977e149f7bf76aaa572e5e8 # v0.3.0 with: post: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
diff --git a/.github/workflows/template-version-comment.yml b/.github/workflows/template-version-comment.yml new file mode 100644 index 00000000..e8560fc7 --- /dev/null +++ b/.github/workflows/template-version-comment.yml @@ -0,0 +1,46 @@ +name: nf-core template version comment +# This workflow is triggered on PRs to check if the pipeline template version matches the latest nf-core version. +# It posts a comment to the PR, even if it comes from a fork. + +on: pull_request_target + +jobs: + template_version: + runs-on: ubuntu-latest + steps: + - name: Check out pipeline code + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Read template version from .nf-core.yml + uses: nichmor/minimal-read-yaml@1f7205277e25e156e1f63815781db80a6d490b8f # v0.0.2 + id: read_yml + with: + config: ${{ github.workspace }}/.nf-core.yml + + - name: Install nf-core + run: | + python -m pip install --upgrade pip + pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }} + + - name: Check nf-core outdated + id: nf_core_outdated + run: echo "OUTPUT=$(pip list --outdated | grep nf-core)" >> ${GITHUB_ENV} + + - name: Post nf-core template version comment + uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2 + if: | + contains(env.OUTPUT, 'nf-core') + with: + repo-token: ${{ secrets.NF_CORE_BOT_AUTH_TOKEN }} + allow-repeats: false + message: | + > [!WARNING] + > Newer version of the nf-core template is available. + > + > Your pipeline is using an old version of the nf-core template: ${{ steps.read_yml.outputs['nf_core_version'] }}. + > Please update your pipeline to the latest version. + > + > For more documentation on how to update your pipeline, please see the [nf-core documentation](https://github.com/nf-core/tools?tab=readme-ov-file#sync-a-pipeline-with-the-template) and [Synchronisation documentation](https://nf-co.re/docs/contributing/sync). 
+ # diff --git a/.gitignore b/.gitignore index 162b87ab..b91d76b2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ -.nextflow.* -.nextflow/ +.nextflow* work/ data/ results/ @@ -8,9 +7,14 @@ testing/ testing* *.pyc null/ -.nf-test/ -.nf-test.* +.nf-test* .idea/ +.vscode/ taggers/ tokenizers/ corpora/ +.github/act.custom_runner.Dockerfile +.ruff_cache +galaxy/test_output/ +TODO +test/ diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index 105a1821..00000000 --- a/.gitpod.yml +++ /dev/null @@ -1,20 +0,0 @@ -image: nfcore/gitpod:latest -tasks: - - name: Update Nextflow and setup pre-commit - command: | - pre-commit install --install-hooks - nextflow self-update - - name: unset JAVA_TOOL_OPTIONS - command: | - unset JAVA_TOOL_OPTIONS - -vscode: - extensions: # based on nf-core.nf-core-extensionpack - - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code - - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files - - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar - - mechatroner.rainbow-csv # Highlight columns in csv files in different colors - # - nextflow.nextflow # Nextflow syntax highlighting - - oderwat.indent-rainbow # Highlight indentation level - - streetsidesoftware.code-spell-checker # Spelling checker for source code - - charliermarsh.ruff # Code linter Ruff diff --git a/.nf-core.yml b/.nf-core.yml index e0b85a77..9d83d644 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,2 +1,33 @@ +lint: + files_exist: + - conf/igenomes.config + - conf/igenomes_ignored.config + - conf/igenomes.config + - conf/igenomes_ignored.config + files_unchanged: + - assets/nf-core-stableexpression_logo_light.png + - docs/images/nf-core-stableexpression_logo_light.png + - docs/images/nf-core-stableexpression_logo_dark.png + - .github/PULL_REQUEST_TEMPLATE.md + nextflow_config: + - params.input + template_strings: + - 
tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet + - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet + - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet + schema_lint: false + +nf_core_version: 3.5.2 repository_type: pipeline -nf_core_version: "2.14.1" +template: + author: Olivier Coen + description: This pipeline is dedicated to identifying the most stable genes within a single or multiple expression dataset(s). This is particularly useful for identifying the most suitable RT-qPCR reference genes for a specific species. + force: false + is_nfcore: true + name: stableexpression + org: nf-core + outdir: . + skip_features: + - igenomes + - fastqc + version: 1.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f54353f1..c7942f15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,24 +1,41 @@ repos: - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.1.0" + rev: "v4.0.0-alpha.8" hooks: - id: prettier additional_dependencies: - - prettier@3.2.5 + - prettier@3.6.2 + exclude: galaxy/ - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "2.7.3" + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - - id: editorconfig-checker - alias: ec + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ + - id: end-of-file-fixer + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.4 + rev: v0.14.1 hooks: # Run the linter. - id: ruff - types_or: [python, pyi] + files: \.py$ args: [--fix] + exclude: bin/old/ # Run the formatter. 
- id: ruff-format - types_or: [python, pyi] + files: \.py$ diff --git a/.prettierignore b/.prettierignore index 486e0f92..7cc55006 100644 --- a/.prettierignore +++ b/.prettierignore @@ -11,3 +11,9 @@ testing* *.pyc bin/ .nf-test/ +ro-crate-metadata.json +modules/nf-core/ +subworkflows/nf-core/ +galaxy/ +docs/ +tests/act diff --git a/.prettierrc.yml b/.prettierrc.yml index c81f9a76..07dbd8bb 100644 --- a/.prettierrc.yml +++ b/.prettierrc.yml @@ -1 +1,6 @@ printWidth: 120 +tabWidth: 4 +overrides: + - files: "*.{md,yml,yaml,html,css,scss,js,cff}" + options: + tabWidth: 2 diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..a33b527c --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "markdown.styles": ["public/vscode_markdown.css"] +} diff --git a/CHANGELOG.md b/CHANGELOG.md index d3ef518e..798c10f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0dev - [date] +## v1.0.0 - 18/03/2026 -Initial release of nf-core/stableexpression, created with the [nf-core](https://nf-co.re/) template. +First complete, official release of nf-core/stableexpression. + +## v1.0dev - 26/01/2025 + +Initial pre-release of nf-core/stableexpression, created with the [nf-core](https://nf-co.re/) template. ### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index 8a127850..e7423ab9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,22 +10,29 @@ ## Pipeline tools -## [Expression Atlas](https://www.ebi.ac.uk/gxa/home) +- [EBI Expression Atlas](https://www.ebi.ac.uk/gxa/home) > Papatheodorou I, Fonseca NA, Keays M, Tang YA, Barrera E, Bazant W, Burke M, Füllgrabe A, Muñoz-Pomer Fuentes A, George N, Huerta L, Koskinen S, Mohammed S, Geniza M, Preece J, Jaiswal P, Jarnuczak AF, Huber W, Stegle O, Vizcaino JA, Brazma A, Petryszak R. 
Expression Atlas: gene and protein expression across multiple studies and organisms. Nucleic Acids Res. 2017 Nov 20;46(Database issue):D246–D251. doi: 10.1093/nar/gkx1158. PubMed PMID: 29165655. -## [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) +- [NCBI GEO](https://www.ncbi.nlm.nih.gov/geo/) + +> Ron Edgar, Michael Domrachev & Alex E Lash. Gene Expression Omnibus: NCBI gene expression and hybridization array data repository. Nucleic Acids Res. 2002 Jan 1;30(1):207-10. doi: 10.1093/nar/30.1.207. PubMed PMID: 11752295. + +- [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) > Reimand J, Kull M, Peterson H, Hansen J, Vilo J. g:Profiler—a web-based toolset for functional profiling of gene lists from large-scale experiments. Nucleic Acids Res. 2007 May 3;35(Web Server issue):W193–W200. doi:10.1093/nar/gkm226. PubMed PMID: 17478515. -## [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) +- [Normfinder](https://rdrr.io/github/dhammarstrom/generefer/man/normfinder.html) + +> Claus Lindbjerg Andersen, Jens Ledet Jensen, Torben Falck Ørntoft. Normalization of Real-Time Quantitative Reverse Transcription-PCR Data: A Model-Based Variance Estimation Approach to Identify Genes Suited for Normalization, Applied to Bladder and Colon Cancer Data Sets. Cancer Res (2004) 64 (15): 5245–5250. doi:10.1158/0008-5472.CAN-04-0496. PubMed PMID: 15289330. + +- [GeNorm](https://pypi.org/project/rna-genorm/) -> Love MI, Huber W & Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol -> . 2014;15(12):550. doi: 10.1186/s13059-014-0550-8. PubMed PMID: 25516281. +> Jo Vandesompele, Katleen De Preter, Filip Pattyn, Bruce Poppe, Nadine Van Roy, Anne De Paepe, Frank Speleman. Accurate normalization of real-time quantitative RT-PCR data by geometric averaging of multiple internal control genes. Genome Biol. 2002 Jun 18;3(7):RESEARCH0034. doi: 10.1186/gb-2002-3-7-research0034 Pubmed PMID: 12184808. 
-## [EdgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) +- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) -> Robinson MD, McCarthy DJ, Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics. 2010 Jan 1;26(1):139-40. doi: 10.1093/bioinformatics/btp616. Pubmed PMID: 19910308. +> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. ## Software packaging/containerisation tools diff --git a/LICENSE b/LICENSE index 7334ce7d..0f46f557 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Olivier Coen +Copyright (c) The nf-core/stableexpression team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 5a59df8f..bea23099 100644 --- a/README.md +++ b/README.md @@ -5,86 +5,139 @@ -[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results) +[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/stableexpression) +[![GitHub Actions CI 
Status](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml) +[![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) +[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1) +[![run with apptainer](https://custom-icon-badges.demolab.com/badge/run%20with-apptainer-4545?logo=apptainer&color=teal&labelColor=000000)](https://apptainer.org/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Seqera 
Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/stableexpression) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction -**nf-core/stableexpression** is a bioinformatics pipeline that aims at finding the most stable genes among a single or multiple public / local count datasets. It takes as input a species name (mandatory), keywords for expression atlas search (optional) and / or a CSV input file listing local raw / normalized count datasets (optional). **A typical usage is to find the most suitable qPCR housekeeping genes for a specific species (and optionally specific conditions)**. 
+**nf-core/stableexpression** is a bioinformatics pipeline aiming to aggregate multiple count datasets for a specific species and find the most stable genes. The datasets can be either downloaded from public databases (EBI, NCBI) or provided directly by the user. Both RNA-seq and Microarray count datasets can be utilised.

- +

-## Pipeline summary +It takes as main inputs: -1. Get Expression Atlas accessions corresponding to the provided species (and optionally keywords) ([Expression Atlas](https://www.ebi.ac.uk/gxa/home); optional) -2. Download Expression Atlas data ([Expression Atlas](https://www.ebi.ac.uk/gxa/home); optional) -3. Normalize raw data (using [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) or [EdgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html)) -4. Map gene IDS to Ensembl IDS for standardisation among datasets ([g:Profiler](https://biit.cs.ut.ee/gprofiler/gost)) -5. Compute gene variation coefficients and get the most stable genes +- a species name (mandatory) +- keywords for Expression Atlas / GEO search (optional) +- a CSV input file listing your own raw / normalised count datasets (optional). -## Usage +**Use cases**: + +- **find the most suitable genes as RT-qPCR reference genes for a specific species (and optionally specific conditions)** +- download all Expression Atlas and / or NCBI GEO datasets for a species (and optionally keywords) + +## Pipeline overview + +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: + +#### 1. Get accessions from public databases + +- Get [Expression Atlas](https://www.ebi.ac.uk/gxa/home) dataset accessions corresponding to the provided species (and optionally keywords) + This step is run by default but is optional. Set `--skip_fetch_eatlas_accessions` to skip it. +- Get NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) **microarray** dataset accessions corresponding to the provided species (and optionally keywords) + This is optional and **NOT** run by default. Set `--fetch_geo_accessions` to run it. + +#### 2. 
Download data (see [usage](./conf/usage.md#3-provide-your-own-accessions)) + +- Download [Expression Atlas](https://www.ebi.ac.uk/gxa/home) data if any +- Download NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) data if any > [!NOTE] -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. +> At this point, datasets downloaded from public databases are merged with datasets provided by the user using the `--datasets` parameter. See [usage](./conf/usage.md#4-use-your-own-expression-datasets) for more information about local datasets. -First, prepare a samplesheet listing the different count datasets: +#### 3. ID Mapping (see [usage](./conf/usage.md#5-custom-gene-id-mapping--metadata)) -`datasets.csv`: +- Gene IDs are cleaned +- Map gene IDs to NCBI Entrez Gene IDs (or Ensembl IDs) for standardisation among datasets using [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) (run by default; optional) +- Rare genes are filtered out -```csv -counts,design,normalized -path/to/normalized.counts.csv,path/to/normalized.design.csv,true -path/to/raw.counts.csv,path/to/raw.design.csv,false -``` +#### 4. Sample filtering + +Samples that show too high ratios of zeros or missing values are removed from the analysis. + +#### 5. Normalisation of expression + +- Normalize RNAseq raw data using TPM (necessitates downloading the corresponding genome and computing transcript lengths) or CPM. +- Perform quantile normalisation on each dataset separately using [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html) + +#### 6. Merge all data + +All datasets are merged into one single dataframe. + +#### 7. 
Imputation of missing values + +Missing values are replaced by imputed values using a specific algorithm provided by [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html). The user can choose the method of imputation with the `--missing_value_imputer` parameter. + +#### 8. General statistics for each gene + +Base statistics are computed for each gene, platform-wide and for each platform (RNAseq and microarray). -Make sure to format your datasets properly: +#### 9. Scoring -`counts.csv`: +- The whole list of genes is divided into multiple sections, based on their expression level. +- Based on the coefficient of variation, a shortlist of candidate genes is extracted for each section. +- Run optimised, scalable version of [Normfinder](https://www.moma.dk/software/normfinder) +- Run optimised, scalable version of [Genorm](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2002-3-7-research0034) (run by default; optional) +- Compute stability scores for each candidate gene -```csv -,sample_A,sample_B,sample_C -gene_1,1,2,3 -gene_2,1,2,3 -... +#### 10. Reporting + +- Result aggregation +- Make [`MultiQC`](http://multiqc.info/) report +- Prepare [Dash Plotly](https://dash.plotly.com/) app for further investigation of gene / sample counts + +## Test pipeline + +You can test the execution of the pipeline locally with: + +```bash +nextflow run nf-core/stableexpression -profile test, ``` -`design.csv`: +## Basic usage -```csv -sample,condition -sample_A,condition_1 -sample_B,condition_2 -... +> [!NOTE] +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. 
+ +To search the most stable genes in a species considering all public datasets, simply run: + +```bash +nextflow run nf-core/stableexpression \ + -profile \ + --species \ + --outdir \ + -resume ``` -Now you can tun the pipeline as follows: +## More advanced usage + +For more specific scenarios, like: -> ```bash -> nextflow run nf-core/stableexpression \ -> -profile docker \ -> --species \ -> --eatlas_accessions \ -> --eatlas_keywords \ -> --datasets ./datasets.csv \ -> --outdir ./results -> ``` +- **fetching only specific conditions** +- **using your own expression dataset(s)** -> [!WARNING] -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; -> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +please refer to the [usage documentation](https://nf-co.re/stableexpression/usage). -For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/stableexpression/usage) and the [parameter documentation](https://nf-co.re/stableexpression/parameters). +## Resource allocation + +For setting pipeline CPU / memory usage, see [here](./docs/configuration.md). + +## Profiles + +See [here](https://nf-co.re/stableexpression/usage#profiles) for more information about profiles. ## Pipeline output @@ -92,10 +145,22 @@ To see the results of an example test run with a full size dataset refer to the For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/stableexpression/output). +## Support us + +If you like nf-core/stableexpression, please make sure you give it a star on GitHub! 
+ +[![stars - stableexpression](https://img.shields.io/github/stars/nf-core/stableexpression?style=social)](https://github.com/nf-core/stableexpression) + ## Credits nf-core/stableexpression was originally written by Olivier Coen. +We thank the following people for their assistance in the development of this pipeline: + +- Rémy Costa +- Shaheen Acheche +- Janine Soares + ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). @@ -104,6 +169,9 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations + + + An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index e116bbee..d4670c25 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,8 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/stableexpression Methods Description" section_href: "https://github.com/nf-core/stableexpression" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline -## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

Data was processed using nf-core/stableexpression v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

diff --git a/assets/multiqc_config.custom_content.template.yaml b/assets/multiqc_config.custom_content.template.yaml new file mode 100644 index 00000000..fae01dd9 --- /dev/null +++ b/assets/multiqc_config.custom_content.template.yaml @@ -0,0 +1,690 @@ +custom_data: + ranked_most_stable_genes_summary_template: + section_name: "Most stable genes" + file_format: "csv" + no_violin: true + description: "Expression descriptive statistics of all genes, ranked by stability. Genes are sorted by stability score - from the most stable to the least stable." + plot_type: "table" + pconfig: + col1_header: "Gene ID" + sort_rows: false + headers: + gene_id: + title: "Gene ID" + rank: + title: "Rank" + description: Rank of the gene based on stability score + scale: "RdYlGn-rev" + cond_formatting_rules: + between_fourth_and_tenth: + - eq: 4 + - eq: 5 + - eq: 6 + - eq: 7 + - eq: 8 + - eq: 9 + - eq: 10 + third: + - eq: 3 + second: + - eq: 2 + first: + - eq: 1 + name: + title: "Gene name" + description: + title: "Gene description" + original_gene_ids: + title: "Original gene IDs" + description: Original gene IDs as stated in the input (provided or downloaded) datasets + stability_score: + title: "Stability score" + description: "Final stability score : the lower, the better" + format: "{:,.6f}" + scale: "RdYlGn-rev" + normfinder_stability_value_normalised: + title: "Normalised Normfinder score" + description: Quantile normalised (among candidate genes) stability value as computed by Normfinder + format: "{:,.6f}" + scale: "PRGn-rev" + genorm_m_measure_normalised: + title: "Normalised Genorm score" + description: Quantile normalised (among candidate genes) M-measure as computed by Genorm + format: "{:,.6f}" + scale: "PRGn-rev" + ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples)" + description: Ratio of samples in which the gene is not represented, excluding samples with particularly low overall gene count. 
+ coefficient_of_variation_normalised: + title: "Normalised CV" + description: Quantile normalised (among candidate genes) coefficient of variation ( std(expression) / mean(expression) ) across all samples. + format: "{:,.6f}" + scale: "PRGn-rev" + robust_coefficient_of_variation_median_normalised: + title: "Normalised RCVm" + description: Quantile normalised (among candidate genes) robust coefficient of variation on median of the expression across all samples. + format: "{:,.4f}" + scale: "PRGn-rev" + coefficient_of_variation: + title: "CV" + description: Coefficient of variation ( std(expression) / mean(expression) ) across all samples. + format: "{:,.6f}" + robust_coefficient_of_variation_median: + title: "RCVm" + description: Robust coefficient of variation on median of the expression across all samples. + format: "{:,.4f}" + normfinder_stability_value: + title: "Normfinder stability value " + description: Stability value as computed by Normfinder + format: "{:,.6f}" + genorm_m_measure: + title: "Genorm M-measure" + description: M-measure as computed by Genorm + format: "{:,.6f}" + mean: + title: "Average" + description: Average expression across all samples. + format: "{:,.4f}" + standard_deviation: + title: "Standard deviation" + description: Standard deviation of the expression across all samples. + format: "{:,.6f}" + median: + title: "Median" + description: Median expression across all samples. + format: "{:,.4f}" + median_absolute_deviation: + title: "MAD" + description: Median absolute deviation of the expression across all samples. + format: "{:,.4f}" + expression_level_status: + title: "Expression level" + description: "Indication about the average gene expression level across all samples (compared to the whole pool of genes). Expression in [0, 0.05]: Very low expression. Expression in [0.05, 0.1]: Low expression. Expression in [0.1, 0.9]: Medium range. Expression in [0.9, 0.95]: High expression. Expression in [0.95, 1]: Very high expression." 
+ cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + rnaseq_coefficient_of_variation: + title: "Var coeff [RNA-seq only]" + description: Coefficient of variation ( std(expression) / mean(expression) ) across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_robust_coefficient_of_variation_median: + title: "RCVm [RNA-seq only]" + description: Robust coefficient of variation on median of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_mean: + title: "Average [RNA-seq only]" + description: Average expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_standard_deviation: + title: "Std [RNA-seq only]" + description: Standard deviation of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_median: + title: "Median [RNA-seq only]" + description: Median expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_median_absolute_deviation: + title: "MAD [RNA-seq only]" + description: Median absolute deviation of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_expression_level_status: + title: "Expression level [RNA-seq only]" + description: "Indication about the average gene expression level across RNA-seq samples (compared to the whole pool of genes). Expression in [0, 0.05]: Very low expression. Expression in [0.05, 0.1]: Low expression. Expression in [0.1, 0.9]: Medium range. Expression in [0.9, 0.95]: High expression. Expression in [0.95, 1]: Very high expression." 
+ cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + microarray_coefficient_of_variation: + title: "Var coeff [Microarray only]" + description: Coefficient of variation ( std(expression) / mean(expression) ) across Microarray samples. + format: "{:,.4f}" + microarray_robust_coefficient_of_variation_median: + title: "RCVm [Microarray only]" + description: Robust coefficient of variation on median of the expression across Microarray samples. + format: "{:,.4f}" + microarray_mean: + title: "Average [Microarray only]" + description: Average expression across Microarray samples. + format: "{:,.4f}" + microarray_standard_deviation: + title: "Std [Microarray only]" + description: Standard deviation of the expression across Microarray samples. + format: "{:,.4f}" + microarray_median: + title: "Median [Microarray only]" + description: Median expression across Microarray samples. + format: "{:,.4f}" + microarray_median_absolute_deviation: + title: "MAD [Microarray only]" + description: Median absolute deviation of the expression across Microarray samples. + format: "{:,.4f}" + microarray_expression_level_status: + title: "Expression level [Microarray only]" + description: "Indication about the average gene expression level across Microarray samples (compared to the whole pool of genes). Expression in [0, 0.05]: Very low expression. Expression in [0.05, 0.1]: Low expression. Expression in [0.1, 0.9]: Medium range. Expression in [0.9, 0.95]: High expression. Expression in [0.95, 1]: Very high expression." 
+ cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + ratio_nulls_in_all_samples: + title: "Ratio null values (all samples)" + description: Ratio of samples in which the gene is not represented. + ratio_zeros: + title: "Ratio zero values" + description: Ratio of samples in which the gene has a zero value. + rnaseq_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [RNA-seq only]" + description: Ratio of RNA-Seq samples in which the gene is not represented. + rnaseq_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [RNA-seq only]" + description: Ratio of RNA-Seq samples in which the gene is not represented, excluding samples with particularly low overall gene count. + rnaseq_ratio_zeros: + title: "Ratio zero values [RNA-seq only]" + description: Ratio of RNA-Seq samples in which the gene has a zero value. + microarray_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [Microarray only]" + description: Ratio of Microarray samples in which the gene is not represented. + microarray_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [Microarray only]" + description: Ratio of Microarray samples in which the gene is not represented, excluding samples with particularly low overall gene count. + microarray_ratio_zeros: + title: "Ratio zero values [Microarray only]" + description: Ratio of Microarray samples in which the gene has a zero value. + + expr_distrib_most_stable_genes_template: + section_name: "Normalised count distributions" + file_format: "csv" + pconfig: + sort_samples: false + xlab: Expression + ylab: Gene + description: "Distribution of normalised gene expression (between 0 and 1) across samples for the most stable genes. 
Only the NB_GENES most stable genes are shown and genes are ranked from the most stable to the least stable." + plot_type: "boxplot" + + gene_statistics: + section_name: "Descriptive statistics - All genes" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + description: Distribution of descriptive statistics for all genes. + plot_type: "violin" + headers: + stability_score: + title: "Stability score" + color: "rgb(186,43,32)" + coefficient_of_variation_normalised: + title: "Normalised CV" + color: "rgb(64, 122, 22)" + robust_coefficient_of_variation_median_normalised: + title: "Normalised RCVm" + color: "rgb(64, 122, 22)" + normfinder_stability_value_normalised: + title: "Normalised Normfinder score" + color: "rgb(64, 122, 22)" + genorm_m_measure_normalised: + title: "Normalised Genorm score" + color: "rgb(64, 122, 22)" + coefficient_of_variation: + title: "CV" + color: "rgb(26, 167, 178)" + robust_coefficient_of_variation_median: + title: "RCVm" + color: "rgb(26, 167, 178)" + normfinder_stability_value: + title: "Normfinder stability value " + color: "rgb(26, 167, 178)" + genorm_m_measure: + title: "Genorm M-measure" + color: "rgb(26, 167, 178)" + mean: + title: "Average" + color: "rgb(26, 167, 178)" + standard_deviation: + title: "Standard deviation" + color: "rgb(26, 167, 178)" + median: + title: "Median" + color: "rgb(26, 167, 178)" + median_absolute_deviation: + title: "MAD" + color: "rgb(26, 167, 178)" + rnaseq_coefficient_of_variation: + title: "Var coeff [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_robust_coefficient_of_variation_median: + title: "RCVm [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_mean: + title: "Average [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_standard_deviation: + title: "Std [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_median: + title: "Median [RNA-seq only]" + color: "rgb(140, 50, 76)" + 
rnaseq_median_absolute_deviation: + title: "MAD [RNA-seq only]" + color: "rgb(140, 50, 76)" + microarray_coefficient_of_variation: + title: "Var coeff [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_robust_coefficient_of_variation_median: + title: "RCVm [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_mean: + title: "Average [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_standard_deviation: + title: "Std [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_median: + title: "Median [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_median_absolute_deviation: + title: "MAD [Microarray only]" + color: "rgb(27, 83, 73)" + ratio_nulls_in_all_samples: + title: "Ratio null values (all samples)" + color: "rgb(106, 78, 193)" + ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples)" + color: "rgb(106, 78, 193)" + ratio_zeros: + title: "Ratio zero values" + color: "rgb(106, 78, 193)" + rnaseq_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [RNA-seq only]" + color: "rgb(106, 78, 193)" + rnaseq_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [RNA-seq only]" + color: "rgb(106, 78, 193)" + rnaseq_ratio_zeros: + title: "Ratio zero values [RNA-seq only]" + color: "rgb(106, 78, 193)" + microarray_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [Microarray only]" + color: "rgb(106, 78, 193)" + microarray_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [Microarray only]" + color: "rgb(106, 78, 193)" + microarray_ratio_zeros: + title: "Ratio zero values [Microarray only]" + color: "rgb(106, 78, 193)" + + skewness: + section_name: "Count skewness" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + pconfig: + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Skewness + ylab: Dataset + description: Distribution of count skewness across 
samples, displayed dataset per dataset. + plot_type: "boxplot" + + ratio_zeros: + section_name: "Proportion of zeros" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + pconfig: + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Proportion of zeros + ylab: Dataset + description: Distribution of zeros across samples, displayed dataset per dataset. + plot_type: "boxplot" + + ratio_nulls: + section_name: "Proportion of missing values" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + pconfig: + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Proportion of null values + ylab: Dataset + description: Distribution of missing values (including genes not present) across samples, displayed dataset per dataset. + plot_type: "boxplot" + + null_values_filter: + section_name: "Filter on null values" + parent_id: filtering + parent_name: "Sample filtering" + parent_description: "Proportion of samples filtered out, relatively to the the variable observed" + file_format: "csv" + pconfig: + sort_samples: true + tt_decimals: 0 + cpswitch: false # show the 'Counts / Percentages' switch + cpswitch_c_active: true # show counts per default + stacking: "relative" + description: Effect of filter on ratio of null (missing) values + categories: + kept: + name: "Nb samples kept" + color: "#2ABF96" + rejected: + name: "Nb samples rejected" + color: "#38B4F2" + plot_type: "barplot" + + zero_values_filter: + section_name: "Filter on zero values" + parent_id: filtering + parent_name: "Sample filtering" + parent_description: "Proportion of samples filtered out, relatively to the the variable observed" + file_format: "csv" + pconfig: + sort_samples: true + tt_decimals: 0 + cpswitch: false # show the 'Counts / Percentages' switch + cpswitch_c_active: true # show counts per default + stacking: "relative" + description: 
Effect of filter on ratio of zero values + categories: + kept: + name: "Nb samples kept" + color: "#2ABF96" + rejected: + name: "Nb samples rejected" + color: "#38B4F2" + plot_type: "barplot" + + id_mapping_stats: + section_name: "Gene ID mapping statistics" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "csv" + pconfig: + sort_samples: true + tt_decimals: 0 + cpswitch: true # show the 'Counts / Percentages' switch + cpswitch_c_active: true # show counts per default + stacking: "relative" + description: Statistics of gene ID mapping, dataset per dataset + categories: + final: + name: "Nb final gene IDs" + color: "#2ABF96" + merged: + name: "Nb gene IDs merged with other IDs" + color: "#38B4F2" + not_valid: + name: "Nb rare gene IDs removed" + color: "#F2C038" + unmapped: + name: "Nb unmapped gene IDs" + color: "#E3224A" + plot_type: "barplot" + + total_gene_id_occurrence_quantiles: + section_name: "Distribution of gene ID occurrence quantiles" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "csv" + pconfig: + categories: true + #ymax: 1.1 + #ymin: -0.1 + #y_lines: + # - value: 1 + # color: "#ff0000" + # width: 2 + # dash: "dash" + # label: "Threshold" + description: Quantiles of the total number of occurrences of gene IDs across all datasets. Quantile values were sorted from greatest to least. + plot_type: "linegraph" + helptext: Gene IDs can be present or absent in the datasets. For each gene ID, the total number of occurrences across all datasets was calculated and quantile values were computed from these totals. 
+ + eatlas_selected_experiments_metadata: + section_name: "Selected" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of selected Expression Atlas datasets corresponding to the provided species (and optionally the provided keywords) + plot_type: "table" + + eatlas_all_experiments_metadata: + section_name: "All datasets" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of all Expression Atlas datasets corresponding to the provided species + plot_type: "table" + + eatlas_failure_reasons: + section_name: "Failure reasons" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Reasons of failure during download of Expression Atlas datasets + plot_type: "table" + + eatlas_warning_reasons: + section_name: "Warnings" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Warnings during download of Expression Atlas datasets + plot_type: "table" + + geo_selected_experiments_metadata: + section_name: "Selected" + parent_id: geo + parent_name: "GEO dataset metadata" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of selected GEO datasets corresponding to the provided species (and optionally the provided keywords) + plot_type: "table" + + geo_all_experiments_metadata: + section_name: "All datasets" + parent_id: geo + 
parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of all GEO datasets corresponding to the provided species + plot_type: "table" + + geo_rejected_experiments_metadata: + section_name: "Rejected GEO datasets" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of all GEO datasets which were rejected + plot_type: "table" + + geo_failure_reasons: + section_name: "Failure reasons" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Reasons of failure during download of GEO datasets + plot_type: "table" + + geo_warning_reasons: + section_name: "Warnings" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Warnings during download of GEO datasets + plot_type: "table" + + id_cleaning_failure_reasons: + section_name: "Gene ID cleaning failure reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: Reasons of failure during gene ID cleaning + plot_type: "table" + + renaming_warning_reasons: + section_name: "Gene renaming warning reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: Reasons of warning during gene ID renaming. 
You can further investigate ID mapping issues on the g:Profiler website at https://biit.cs.ut.ee/gprofiler/convert + plot_type: "table" + + renaming_failure_reasons: + section_name: "Gene renaming failure reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: Reasons of failure during gene ID renaming + plot_type: "table" + + normalisation_failure_reasons: + section_name: "Failure reasons" + parent_id: normalisation + parent_name: "Normalisation" + parent_description: "Information about the normalisation" + file_format: "tsv" + no_violin: true + description: Reasons of failure during Normalisation (DESeq2 or edgeR) + plot_type: "table" + + normalisation_warning_reasons: + section_name: "Warning reasons" + parent_id: normalisation + parent_name: "Normalisation" + parent_description: "Information about the normalisation" + file_format: "tsv" + no_violin: true + description: Reasons of failure during Normalisation (DESeq2 or edgeR) + plot_type: "table" + +custom_content: + order: + - gene_statistics + - skewness + - ratio_zeros + - ratio_nulls + - null_values_filter + - zero_values_filter + - id_mapping_stats + - total_gene_id_occurrence_quantiles + - eatlas_selected_experiments_metadata + - eatlas_all_experiments_metadata + - eatlas_failure_reasons + - eatlas_warning_reasons + - geo_selected_experiments_metadata + - geo_all_experiments_metadata + - geo_rejected_experiments_metadata + - geo_failure_reasons + - geo_warning_reasons + - id_cleaning_failure_reasons + - renaming_warning_reasons + - renaming_failure_reasons + - normalisation_failure_reasons + - normalisation_warning_reasons + +sp: + ranked_most_stable_genes_summary_template: + fn: "*SECTION.most_stable_genes_summary.csv" + max_filesize: 5000000 # 5MB + expr_distrib_most_stable_genes_template: + fn: "*SECTION.most_stable_genes_transposed_counts.csv" + max_filesize: 50000000 # 50MB + 
gene_statistics: + fn: "*all_genes_summary.csv" + max_filesize: 50000000 # 50MB + id_mapping_stats: + fn: "*id_mapping_stats.csv" + null_values_filter: + fn: "*missing_values_filter_stats.csv" + zero_values_filter: + fn: "*zero_values_filter_stats.csv" + total_gene_id_occurrence_quantiles: + fn: "*total_gene_id_occurrence_quantiles.csv" + skewness: + fn: "*skewness.transposed.csv" + ratio_zeros: + fn: "*ratio_zeros.transposed.csv" + ratio_nulls: + fn: "*ratio_nulls.transposed.csv" + eatlas_selected_experiments_metadata: + fn: "*selected_experiments.metadata.tsv" + eatlas_all_experiments_metadata: + fn: "*species_experiments.metadata.tsv" + eatlas_failure_reasons: + fn: "*eatlas_failure_reasons.csv" + eatlas_warning_reasons: + fn: "*eatlas_warning_reasons.csv" + geo_selected_experiments_metadata: + fn: "*geo_selected_datasets.metadata.tsv" + geo_all_experiments_metadata: + fn: "*geo_all_datasets.metadata.tsv" + geo_rejected_experiments_metadata: + fn: "*geo_rejected_datasets.metadata.tsv" + geo_failure_reasons: + fn: "*geo_failure_reasons.csv" + geo_warning_reasons: + fn: "*geo_warning_reasons.csv" + id_cleaning_failure_reasons: + fn: "*id_cleaning_failure_reasons.tsv" + renaming_warning_reasons: + fn: "*renaming_warning_reasons.tsv" + renaming_failure_reasons: + fn: "*renaming_failure_reasons.tsv" + normalisation_failure_reasons: + fn: "*normalisation_failure_reasons.csv" + normalisation_warning_reasons: + fn: "*normalisation_warning_reasons.csv" diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index b790ca96..565c88e2 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,15 +1,34 @@ +report_comment: > + This report has been generated by the nf-core/stableexpression analysis pipeline. For information about how to interpret these results, please see the documentation. 
+ report_section_order: "nf-core-stableexpression-methods-description": order: -1000 - "custom_data": - order: -1001 software_versions: - order: -1002 + order: -1001 "nf-core-stableexpression-summary": - order: -1003 + order: -1002 export_plots: true -report_comment: > - This report has been generated by the nf-core/stableexpression analysis pipeline. For information - about how to interpret these results, please see the documentation. +run_modules: + - custom_content + +disable_version_detection: true + +max_table_rows: 100000 + +table_cond_formatting_colours: + - first: "#ffd700" + - second: "#C0C0C0" + - third: "#CD7F32" + - between_fourth_and_tenth: "#468F8F" + - very_low: "#337ab7" + - low: "#5bc0de" + - medium: "#5cb85c" + - high: "#f0ad4e" + - very_high: "#d9534f" + +#violin_downsample_after: 10000 + +log_filesize_limit: 10000000000 # 10GB diff --git a/assets/nf-core-sampleexpression_logo_light.png b/assets/nf-core-sampleexpression_logo_light.png deleted file mode 100644 index 5b38865f..00000000 Binary files a/assets/nf-core-sampleexpression_logo_light.png and /dev/null differ diff --git a/assets/nf-core-stableexpression_logo_light.png b/assets/nf-core-stableexpression_logo_light.png new file mode 100644 index 00000000..d1af4a9c Binary files /dev/null and b/assets/nf-core-stableexpression_logo_light.png differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5f653ab7..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_datasets.json b/assets/schema_datasets.json new file mode 100644 index 00000000..fa320e10 --- /dev/null +++ b/assets/schema_datasets.json @@ -0,0 +1,41 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": 
"https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_datasets.json", + "title": "nf-core/stableexpression pipeline - params.schema_datasets schema", + "description": "Schema for the file provided with params.datasets", + "type": "array", + "items": { + "type": "object", + "properties": { + "counts": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(csv|tsv|dat)$", + "errorMessage": "You must provide a count dataset file" + }, + "design": { + "type": "string", + "format": "file-path", + "schema": "assets/schema_design.json", + "exists": true, + "pattern": "^\\S+\\.(csv|tsv|dat)$", + "errorMessage": "You must provide a design file", + "meta": ["design"] + }, + "platform": { + "type": "string", + "errorMessage": "You must specify the platform of the dataset", + "enum": ["rnaseq", "microarray"], + "meta": ["platform"] + }, + "normalised": { + "type": "boolean", + "description": "Specify whether the dataset is already normalised", + "errorMessage": "You must specify whether the dataset is already normalised (true or false)", + "meta": ["normalised"] + } + }, + "required": ["counts", "design", "platform", "normalised"] + } +} diff --git a/assets/schema_design.json b/assets/schema_design.json index 925eaf28..dc1e4b87 100644 --- a/assets/schema_design.json +++ b/assets/schema_design.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_input.json", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_design.json", "title": "nf-core/stableexpression pipeline - design schema", - "description": "Schema for the file provided with in the design column of the params.input CSV file", + "description": "Schema for the design file provided in the design column of the params.datasets CSV / TSV file", "type": "array", "items": { "type": "object", diff --git 
a/assets/schema_gene_id_mapping.json b/assets/schema_gene_id_mapping.json new file mode 100644 index 00000000..fc537199 --- /dev/null +++ b/assets/schema_gene_id_mapping.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_id_mapping.json", + "title": "nf-core/stableexpression pipeline - custom mappings schema", + "description": "Schema for the file provided with the params.gene_id_mapping CSV / TSV file", + "type": "array", + "items": { + "type": "object", + "properties": { + "original_gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for original gene IDs." + }, + "gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for mapped IDs." + } + }, + "required": ["original_gene_id", "gene_id"] + } +} diff --git a/assets/schema_gene_length.json b/assets/schema_gene_length.json new file mode 100644 index 00000000..b395cea0 --- /dev/null +++ b/assets/schema_gene_length.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_length.json", + "title": "nf-core/stableexpression pipeline - custom mappings schema", + "description": "Schema for the file provided with in the design column of the params.gene_length CSV file", + "type": "array", + "items": { + "type": "object", + "properties": { + "gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for original gene IDs." + }, + "length": { + "type": "integer", + "minimum": 0, + "errorMessage": "You must provide a column for gene lengths." 
+ } + }, + "required": ["gene_id", "length"] + } +} diff --git a/assets/schema_gene_metadata.json b/assets/schema_gene_metadata.json new file mode 100644 index 00000000..d3faad8c --- /dev/null +++ b/assets/schema_gene_metadata.json @@ -0,0 +1,28 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_metadata.json", + "title": "nf-core/stableexpression pipeline - gene metadata schema", + "description": "Schema for the CSV / TSV file provided with params.gene_metadata", + "type": "array", + "items": { + "type": "object", + "properties": { + "gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for mapped IDs." + }, + "name": { + "type": "string", + "pattern": "^[^,]+$", + "errorMessage": "You must provide a column for gene names." + }, + "description": { + "type": "string", + "pattern": "^[^,]+$", + "errorMessage": "You must provide a column for gene descriptions."
+ } + }, + "required": ["gene_id", "name", "description"] + } +} diff --git a/assets/schema_input.json b/assets/schema_input.json deleted file mode 100644 index 6b439a0e..00000000 --- a/assets/schema_input.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_input.json", - "title": "nf-core/stableexpression pipeline - params.input schema", - "description": "Schema for the file provided with params.input", - "type": "array", - "items": { - "type": "object", - "properties": { - "counts": { - "type": "string", - "format": "file-path", - "exists": true, - "pattern": "^\\S+\\.csv$", - "errorMessage": "You must provide a count dataset file" - }, - "design": { - "type": "string", - "format": "file-path", - "exists": true, - "pattern": "^\\S+\\.csv$", - "errorMessage": "You must provide a design file" - }, - "normalized": { - "type": "boolean", - "description": "Specify whether the dataset is already normalized", - "errorMessage": "You must specify whether the dataset is already normalized (true or false)" - } - }, - "required": ["counts", "design", "normalized"] - } -} diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 0c0d811e..ce432dbe 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -14,7 +14,7 @@ Content-Transfer-Encoding: base64 Content-ID: Content-Disposition: inline; filename="nf-core-stableexpression_logo_light.png" -<% out << new File("$projectDir/docs/images/nf-core-stableexpression_logo_light.png"). +<% out << new File("$projectDir/assets/nf-core-stableexpression_logo_light.png"). bytes. encodeBase64(). toString(). diff --git a/bin/aggregate_results.py b/bin/aggregate_results.py new file mode 100755 index 00000000..fd536674 --- /dev/null +++ b/bin/aggregate_results.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. 
# Released under the MIT license.

import argparse
import logging
from pathlib import Path

import config
import polars as pl
import yaml
from common import write_float_csv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# outfile names
ALL_GENE_SUMMARY_OUTFILENAME = "all_genes_summary.csv"
SUMMARY_OUTFILENAME_SUFFIX = "most_stable_genes_summary.csv"
COUNTS_OUTFILENAME_SUFFIX = "most_stable_genes_transposed_counts.csv"
CUSTOM_CONTENT_MULTIQC_CONFIG_FILE = "custom_content_multiqc_config.yaml"

# quantile intervals
NB_EXPRESSION_QUANTILES = 100
NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS = 25

# separator used when several original gene IDs map to the same gene ID;
# shared by get_mappings (writer) and search_target_genes (reader)
ORIGINAL_GENE_IDS_SEPARATOR = ";"

# translation table removing "-", "_" and "." (built once, used by format_genes)
_GENE_FORMAT_TABLE = str.maketrans("", "", "-_.")

#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments of the aggregation script."""
    parser = argparse.ArgumentParser(
        description="Get statistics from count data for each gene"
    )
    parser.add_argument(
        "--counts", type=Path, dest="count_file", required=True, help="Count file"
    )
    parser.add_argument(
        "--target-genes",
        type=str,
        nargs="+",
        dest="target_genes",
        default=[],
        help="File containing target genes",
    )
    parser.add_argument(
        "--stats-with-scores",
        type=Path,
        nargs="+",
        dest="stat_score_files",
        required=True,
        help="Files containing statistics for all genes and stability scores by candidate genes, one per section",
    )
    parser.add_argument(
        "--multiqc-config",
        type=Path,
        dest="multiqc_config",
        required=True,
        help="MultiQC config file for custom content",
    )
    parser.add_argument(
        "--platform-stats",
        type=Path,
        dest="platform_stat_files",
        nargs="+",
        # the option is not required: default to an empty list rather than None
        default=[],
        help="File containing base statistics for all genes and for all datasets for a specific platform",
    )
    parser.add_argument(
        "--metadata",
        type=str,
        dest="metadata_files",
        help="Metadata file",
    )
    parser.add_argument(
        "--mappings", type=str, dest="mapping_files", help="Mapping file"
    )
    return parser.parse_args()


def parse_stat_score_file(file: Path) -> pl.DataFrame:
    """Read a CSV file and cast its gene ID column to String."""
    return pl.read_csv(file).with_columns(
        pl.col(config.GENE_ID_COLNAME).cast(pl.String())
    )


def get_non_empty_dataframes(files: list[Path]) -> list[pl.DataFrame]:
    """Read every CSV file and keep only the dataframes that contain rows."""
    dfs = [pl.read_csv(file) for file in files]
    return [df for df in dfs if not df.is_empty()]


def cast_cols_to_string(df: pl.DataFrame) -> pl.DataFrame:
    """Cast all columns of the dataframe to String."""
    return df.select(
        [pl.col(column).cast(pl.String) for column in df.collect_schema().names()]
    )


def concat_cast_to_string_and_drop_duplicates(
    files: list[Path],
) -> pl.DataFrame | None:
    """Cast all columns to String, concatenate the DataFrames and drop duplicates.

    Each non-empty dataframe is first cast to String so that all of them share
    the same column types, then they are concatenated and duplicate rows are
    dropped.

    Returns None when none of the files contains any row (pl.concat cannot
    concatenate an empty list).
    """
    dfs = [cast_cols_to_string(df) for df in get_non_empty_dataframes(files)]
    if not dfs:
        return None
    # dropping duplicates
    return pl.concat(dfs).unique()


def cast_count_columns_to_float(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column except the gene ID column to Float64."""
    return df.select(
        pl.col(config.GENE_ID_COLNAME),
        pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64),
    )


def join_data_on_gene_id(stat_df: pl.DataFrame, *dfs: pl.DataFrame) -> pl.DataFrame:
    """Left-join each additional dataframe to the statistics dataframe on the gene ID column."""
    for df in dfs:
        stat_df = stat_df.join(df, on=config.GENE_ID_COLNAME, how="left")
    return stat_df


def get_counts(file: Path) -> pl.DataFrame:
    """Read the parquet count file, sorted by gene ID."""
    # sorting dataframe (necessary to get consistent output)
    return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False)


def get_metadata(metadata_files: list[Path]) -> pl.DataFrame | None:
    """Retrieve and concatenate metadata from a list of metadata files.

    Returns None when no file is given or when all files are empty.
    """
    if not metadata_files:
        return None
    return concat_cast_to_string_and_drop_duplicates(metadata_files)


def get_mappings(mapping_files: list[Path]) -> pl.DataFrame | None:
    """Retrieve mappings and gather, for each gene ID, its original gene IDs.

    Returns None when no file is given or when all files are empty.
    """
    if not mapping_files:
        return None
    concat_df = concat_cast_to_string_and_drop_duplicates(mapping_files)
    if concat_df is None:
        return None
    # group by new gene ID, get the sorted list of unique original gene IDs
    # and convert this list to a single string (IDs separated by a semicolon)
    return concat_df.group_by(config.GENE_ID_COLNAME).agg(
        pl.col(config.ORIGINAL_GENE_ID_COLNAME)
        .unique()
        .sort()
        .str.join(ORIGINAL_GENE_IDS_SEPARATOR)
        .alias(config.ORIGINAL_GENE_IDS_COLNAME)
    )


def get_status(quantile_interval: int) -> str:
    """Return the expression level status of the gene given its quantile interval."""
    if NB_EXPRESSION_QUANTILES - 5 <= quantile_interval:
        return "Very high expression"
    elif (
        NB_EXPRESSION_QUANTILES - 10 <= quantile_interval < NB_EXPRESSION_QUANTILES - 5
    ):
        return "High expression"
    elif 4 < quantile_interval <= 9:
        return "Low expression"
    elif quantile_interval <= 4:
        return "Very low expression"
    else:
        return "Medium range"


def add_expression_level_status(df: pl.DataFrame) -> pl.DataFrame:
    """Add a status column derived from the quantile interval column."""
    logger.info("Adding expression level status")
    mapping_dict = {
        quantile_interval: get_status(quantile_interval)
        for quantile_interval in range(NB_EXPRESSION_QUANTILES)
    }
    return df.with_columns(
        pl.col(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME)
        .replace_strict(mapping_dict)
        .alias(config.EXPRESSION_LEVEL_STATUS_COLNAME)
    )


def complement_gene_summary_table(
    stat_summary_df: pl.DataFrame, *dfs: pl.DataFrame
) -> pl.DataFrame:
    """
    Add various metadata to statistics summary.
    """
    # join the additional dataframes, then add the expression level status
    stat_summary_df = join_data_on_gene_id(stat_summary_df, *dfs)
    stat_summary_df = add_expression_level_status(stat_summary_df)
    return stat_summary_df


def get_most_stable_genes_counts(
    log_count_df: pl.DataFrame, stat_summary_df: pl.DataFrame
) -> pl.DataFrame:
    """Return the transposed counts of the first genes of stat_summary_df.

    The output has one column per gene (in the order of stat_summary_df);
    genes absent from log_count_df are skipped.
    """
    # getting list of top stable genes with their order
    top_genes_with_order = (
        stat_summary_df.head(NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS)
        .select(config.GENE_ID_COLNAME)
        .with_row_index("sort_order")
    )

    # join to get only existing genes and maintain order
    sorted_transposed_counts_df = log_count_df.join(
        top_genes_with_order, on=config.GENE_ID_COLNAME, how="inner"
    ).sort("sort_order", descending=False)

    # get the actual gene names that were found (in order)
    actual_gene_names = (
        sorted_transposed_counts_df.select(config.GENE_ID_COLNAME).to_series().to_list()
    )
    return sorted_transposed_counts_df.drop(
        ["sort_order", config.GENE_ID_COLNAME]
    ).transpose(column_names=actual_gene_names)


def format_multiqc_section(
    section: str, nb_sections: int, template_dict: dict, found_target_genes: list[dict]
):
    """Fill a copy of a MultiQC custom data template for one section."""
    section_dict = dict(template_dict)

    parent_id = section.replace("_", " ")
    parent_name = f"{parent_id.capitalize()} / {nb_sections}: most stable genes"
    parent_description = (
        f"Most stable genes and distribution of their normalised counts for {parent_id} / {nb_sections}"
        + " (section 1 corresponding to the most expressed genes)"
    )

    additional_name = ""
    if found_target_genes:
        additional_names = [
            f"{d['target_gene']} ({d['gene']})" for d in found_target_genes
        ]
        additional_name = ". Comprises " + ", ".join(additional_names)

    section_dict["parent_id"] = parent_id
    section_dict["parent_name"] = parent_name + additional_name
    section_dict["parent_description"] = parent_description

    return section_dict


def format_multiqc_sp(section: str, template_dict: dict):
    """Fill a copy of a MultiQC search pattern template for one section."""
    sp_dict = dict(template_dict)
    sp_dict["fn"] = sp_dict["fn"].replace("SECTION", section)
    return sp_dict


def format_genes(genes: list[str]):
    """Lowercase gene names and remove all "-", "_" and "." characters."""
    # str.translate() with a precomputed table is faster than re.sub
    return pl.Series(
        [gene.lower().translate(_GENE_FORMAT_TABLE).strip() for gene in genes]
    )


def search_target_genes(df: pl.DataFrame, target_genes: list[str]) -> list[dict]:
    """
    Search for target genes in a DataFrame.

    Args:
        df (pl.DataFrame): The DataFrame to search in.
        target_genes (list[str]): The list of target genes to search for.

    Returns:
        list[dict]: A list of dictionaries associating each found target gene with its corresponding gene ID in the datasets.
    """

    unique_gene_ids = set(df[config.GENE_ID_COLNAME].to_list())

    if config.GENE_NAME_COLNAME in df.columns:
        unique_gene_ids |= set(df[config.GENE_NAME_COLNAME].to_list())

    if config.ORIGINAL_GENE_IDS_COLNAME in df.columns:
        # original gene IDs were joined with ORIGINAL_GENE_IDS_SEPARATOR in
        # get_mappings: the same separator must be used to split them back
        original_gene_ids = (
            df.select(
                pl.col(config.ORIGINAL_GENE_IDS_COLNAME)
                .str.split(by=ORIGINAL_GENE_IDS_SEPARATOR)
                .explode()
            )
            .to_series()
            .to_list()
        )
        unique_gene_ids |= set(original_gene_ids)

    # putting all unique gene IDs, gene names and original gene IDs into single list
    all_unique_gene_ids = [gene for gene in unique_gene_ids if gene is not None]

    # formatting all gene IDs found
    formated_gene_ids_df = pl.DataFrame({"gene": all_unique_gene_ids}).with_columns(
        pl.col("gene")
        .map_batches(format_genes, return_dtype=pl.String)
        .alias("formatted_gene")
    )

    # formatting target genes
    formated_target_genes_df = pl.DataFrame({"target_gene": target_genes}).with_columns(
        pl.col("target_gene")
        .map_batches(format_genes, return_dtype=pl.String)
        .alias("formatted_gene")
    )

    return (
        formated_gene_ids_df.join(
            formated_target_genes_df, on="formatted_gene", how="inner"
        )
        .select(["target_gene", "gene"])
        .sort("target_gene")
        .to_dicts()
    )


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    args = parse_args()

    # --------------------------------------------------
    # Parsing counts
    # --------------------------------------------------

    count_df = get_counts(args.count_file)
    # casting counts to Float64 (they are only used for plotting by MultiQC)
    count_df = cast_count_columns_to_float(count_df)

    # --------------------------------------------------
    # Parsing statistics and scores, section by section
    # --------------------------------------------------

    stat_score_dfs = []
    sections = []
    for file in args.stat_score_files:
        # the section name is at the beginning of the file name
        section = file.name.split(".")[0]
        df = parse_stat_score_file(file)
        df = df.with_columns(pl.lit(section).alias(config.SECTION_COLNAME))
        stat_score_dfs.append(df)
        sections.append(section)

    stat_score_df = pl.concat(stat_score_dfs)

    if stat_score_df.select(config.GENE_ID_COLNAME).is_duplicated().any():
        raise ValueError("Duplicate gene IDs found in statistics and scores files.")

    # sorting sections by their trailing number (ascending)
    sections = sorted(sections, key=lambda section: int(section.split("_")[-1]))

    # --------------------------------------------------
    # Parsing MultiQC template config for custom content
    # --------------------------------------------------

    with open(args.multiqc_config, "r") as f:
        multiqc_config = yaml.safe_load(f)

    # putting template parts aside (and removing them from the config)
    ranking_dict = multiqc_config["custom_data"].pop(
        "ranked_most_stable_genes_summary_template"
    )
    ranking_sp_dict = multiqc_config["sp"].pop(
        "ranked_most_stable_genes_summary_template"
    )
    expr_distrib_dict = multiqc_config["custom_data"].pop(
        "expr_distrib_most_stable_genes_template"
    )
    expr_distrib_sp_dict = multiqc_config["sp"].pop(
        "expr_distrib_most_stable_genes_template"
    )
    other_sections = multiqc_config["custom_content"].pop("order")

    # filling dynamically the number of genes to show in box plots
    expr_distrib_dict["description"] = expr_distrib_dict["description"].replace(
        "NB_GENES", str(NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS)
    )

    # --------------------------------------------------
    # Parsing statistics per platform
    # --------------------------------------------------

    # --platform-stats is optional: guard against a missing value
    platform_datasets_stat_dfs = [
        parse_stat_score_file(file)
        for file in (args.platform_stat_files or [])
        if file is not None
    ]

    # --------------------------------------------------
    # Parsing metadata and mapping files
    # --------------------------------------------------

    # split() (no argument) ignores repeated / leading / trailing spaces,
    # which would otherwise give empty paths
    metadata_files = (
        [Path(file) for file in args.metadata_files.split()]
        if args.metadata_files is not None
        else []
    )
    mapping_files = (
        [Path(file) for file in args.mapping_files.split()]
        if args.mapping_files is not None
        else []
    )

    # parsing metadata and mapping files
    metadata_df = get_metadata(metadata_files)
    mapping_df = get_mappings(mapping_files)
    optional_dfs = [df for df in [metadata_df, mapping_df] if df is not None]

    # --------------------------------------------------
    # Adding metadata, mapping and platform statistics information to gene summary table
    # --------------------------------------------------

    additional_data_dfs = optional_dfs + platform_datasets_stat_dfs
    all_genes_summary_df = complement_gene_summary_table(
        stat_score_df, *additional_data_dfs
    )

    logger.info(f"Exporting statistics of all genes to: {ALL_GENE_SUMMARY_OUTFILENAME}")
    # sorting values in order to have a consistent output
    all_genes_summary_df.sort(by=config.GENE_ID_COLNAME).write_csv(
        ALL_GENE_SUMMARY_OUTFILENAME, float_precision=config.CSV_FLOAT_PRECISION
    )

    # --------------------------------------------------
    # Getting summary table and counts for each section
    # Adding new sections in MultiQC config for each new expression section
    # --------------------------------------------------

    nb_sections = len(sections)
    new_mqc_config_sections = {}
    new_mqc_config_sp = {}

    logger.info("Making new sections in the MultiQC config")
    for section in sections:
        # getting best candidates for this section
        # (the section column was created above with config.SECTION_COLNAME)
        section_df = (
            all_genes_summary_df.filter(pl.col(config.SECTION_COLNAME) == section)
            .drop(config.SECTION_COLNAME)
            .sort(config.STABILITY_SCORE_COLNAME, nulls_last=True, maintain_order=True)
        )

        found_target_genes = []
        if args.target_genes:
            found_target_genes = search_target_genes(section_df, args.target_genes)

        section_most_stable_genes_counts_df = get_most_stable_genes_counts(
            count_df, section_df
        )

        section_summary_outfile = f"{section}.{SUMMARY_OUTFILENAME_SUFFIX}"
        write_float_csv(section_df, section_summary_outfile)

        section_counts_outfile = f"{section}.{COUNTS_OUTFILENAME_SUFFIX}"
        write_float_csv(section_most_stable_genes_counts_df, section_counts_outfile)

        # making new sections in the MultiQC config
        new_mqc_config_sections[f"genes_{section}"] = format_multiqc_section(
            section, nb_sections, ranking_dict, found_target_genes
        )
        new_mqc_config_sections[f"normalised_expr_distrib_{section}"] = (
            format_multiqc_section(
                section, nb_sections, expr_distrib_dict, found_target_genes
            )
        )
        new_mqc_config_sp[f"genes_{section}"] = format_multiqc_sp(
            section, ranking_sp_dict
        )
        new_mqc_config_sp[f"normalised_expr_distrib_{section}"] = format_multiqc_sp(
            section, expr_distrib_sp_dict
        )

    # adding new sections
    multiqc_config["custom_data"] = (
        new_mqc_config_sections | multiqc_config["custom_data"]
    )
    # specifying the filenames linked to the new sections
    multiqc_config["sp"] = new_mqc_config_sp | multiqc_config["sp"]
    # specifying the section order
    multiqc_config["custom_content"]["order"] = (
        list(new_mqc_config_sections.keys()) + other_sections
    )

    with open(CUSTOM_CONTENT_MULTIQC_CONFIG_FILE, "w") as f:
        yaml.dump(multiqc_config, f, indent=4, sort_keys=False)

    logger.info("Done")


if __name__ == "__main__":
    main()

# ---- diff boundary: bin/clean_gene_ids.py (new file mode 100755) ----
# Written by Olivier Coen. Released under the MIT license.

import argparse
import logging
import sys
from pathlib import Path

import config
import polars as pl
from common import parse_count_table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


##################################################################
# CONSTANTS
##################################################################

CLEANED_COUNTS_SUFFIX = ".cleaned.parquet"

FAILURE_REASON_FILE = "failure_reason.txt"

##################################################################
# FUNCTIONS
##################################################################


def parse_args():
    """Parse the command line arguments (a single --count-file path)."""
    # the text must be passed as `description`: the first positional argument
    # of ArgumentParser is `prog` (the program name shown in the usage line)
    parser = argparse.ArgumentParser(
        description="Clean gene IDs (Ensembl versions, miRNA arm suffixes) in a count file"
    )
    parser.add_argument(
        "--count-file", type=Path, required=True, help="Input file containing counts"
    )
    return parser.parse_args()


def clean_ensembl_gene_id_versioning(df: pl.DataFrame):
    """
    Clean Ensembl gene IDs by removing version numbers.
    Remove the dot and the numbers after it in IDs like ENSG00000000003.17

    Only IDs starting with "ENSG" are modified; other IDs are kept as is.
    """
    return df.with_columns(
        pl.when(pl.col(config.GENE_ID_COLNAME).str.starts_with("ENSG"))
        .then(pl.col(config.GENE_ID_COLNAME).str.extract(r"^(ENSG[a-zA-Z0-9]+)", 1))
        .otherwise(pl.col(config.GENE_ID_COLNAME))
        .alias(config.GENE_ID_COLNAME)
    )


def clean_mirna_ids(df: pl.DataFrame):
    """
    Clean miRNA IDs by removing the 5p / 3p identifier.

    Only IDs ending with "-5p" or "-3p" are modified; other IDs are kept as is.
    """
    return df.with_columns(
        pl.when(pl.col(config.GENE_ID_COLNAME).str.contains(r"-[53]p$"))
        .then(pl.col(config.GENE_ID_COLNAME).str.extract(r"^(.*?)-[53]p$"))
        .otherwise(pl.col(config.GENE_ID_COLNAME))
        .alias(config.GENE_ID_COLNAME)
    )


def _write_failure_reason(msg: str):
    """Write the failure reason to FAILURE_REASON_FILE."""
    with open(FAILURE_REASON_FILE, "w") as f:
        f.write(msg)


##################################################################
# MAIN
##################################################################


def main():
    args = parse_args()

    logger.info(f"Converting IDs for count file {args.count_file.name}...")

    #############################################################
    # PARSING FILES
    #############################################################

    df = parse_count_table(args.count_file)

    # failures are reported through FAILURE_REASON_FILE and the script exits
    # with status 0 (deliberate: the exit code does not signal the failure)
    if df.is_empty():
        msg = "COUNT FILE IS EMPTY"
        logger.warning(msg)
        _write_failure_reason(msg)
        sys.exit(0)

    try:
        df = clean_ensembl_gene_id_versioning(df)
        df = clean_mirna_ids(df)
    except Exception as e:
        msg = f"ERROR CLEANING IDS in count file {args.count_file.name}: {e}"
        logger.error(msg)
        _write_failure_reason(msg)
        sys.exit(0)

    #############################################################
    # WRITING CLEANED COUNTS
    #############################################################

    logger.info("Writing count file with cleaned IDs")
    # the output is written next to the input file
    count_outfile = args.count_file.with_name(
        args.count_file.stem + CLEANED_COUNTS_SUFFIX
    )
    df.write_parquet(count_outfile)


if __name__ == "__main__":
    main()

# ---- diff boundary: bin/collect_gene_ids.py (new file mode 100755) ----
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
from collections import Counter
from pathlib import Path

import config
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

UNIQUE_GENE_IDS_OUTFILE = "unique_gene_ids.txt"
GENE_ID_OCCURRENCES_OUTFILE = "gene_id_occurrences.csv"


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments (space-separated list of gene ID files)."""
    parser = argparse.ArgumentParser(description="Collect gene IDs from count files")
    parser.add_argument(
        "--ids", type=str, dest="gene_id_files", required=True, help="Gene ID files"
    )
    return parser.parse_args()


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    """Write the sorted unique gene IDs and the occurrence count of each ID.

    Each input file holds one gene ID per line. An ID is counted once per
    line on which it appears.
    """
    args = parse_args()

    # split() (no argument) ignores repeated / leading / trailing spaces,
    # which would otherwise give empty paths
    gene_id_files = [Path(file) for file in args.gene_id_files.split()]
    logger.info(f"Getting gene IDs from {len(gene_id_files)} files")

    counter = Counter()
    for gene_id_file in tqdm(gene_id_files):
        with open(gene_id_file, "r") as fin:
            # blank lines are not gene IDs: skip them
            counter.update(
                gene_id for gene_id in (line.strip() for line in fin) if gene_id
            )

    # the keys of the counter are the unique gene IDs
    sorted_occurrences = sorted(counter.items())

    with open(UNIQUE_GENE_IDS_OUTFILE, "w") as fout:
        fout.write("\n".join(gene_id for gene_id, _ in sorted_occurrences))

    with open(GENE_ID_OCCURRENCES_OUTFILE, "w") as fout:
        fout.write(
            f"{config.ORIGINAL_GENE_ID_COLNAME},{config.GENE_ID_COUNT_COLNAME}\n"
        )
        for gene_id, count in sorted_occurrences:
            fout.write(f"{gene_id},{count}\n")


if __name__ == "__main__":
    main()

# ---- diff boundary: bin/collect_statistics.py (new file mode 100755) ----
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.

import argparse
import logging
from pathlib import Path

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

TRANSPOSED_OUTFILE_SUFFIX = ".transposed.csv"


def parse_args():
    """Parse the command line arguments (a single --file path)."""
    parser = argparse.ArgumentParser(description="Collect statistics")
    parser.add_argument(
        "--file",
        type=Path,
        required=True,
    )
    return parser.parse_args()


def _read_ragged_rows(file: Path) -> list:
    """Read comma-separated rows of unequal lengths, padded with None to the same length."""
    # parsing file manually because it's not a standard CSV format
    with open(file, "r") as f:
        # blank lines carry no data: skip them
        data = [line.strip().split(",") for line in f if line.strip()]

    # getting max number of columns (0 for an empty file)
    max_nb_cols = max((len(row) for row in data), default=0)
    # fill missing values with None
    return [row + [None] * (max_nb_cols - len(row)) for row in data]


def _get_outfile_name(file: Path) -> str:
    """Build the output file name: <name without .csv extension>.transposed.csv"""
    # only a trailing ".csv" extension is replaced, so that the output name
    # always differs from the input name
    if file.suffix == ".csv":
        return file.stem + TRANSPOSED_OUTFILE_SUFFIX
    return file.name + TRANSPOSED_OUTFILE_SUFFIX


def main():
    """Transpose the statistics file: one output column per input row.

    The first item of each input row is used as the name of the
    corresponding output column.
    """
    args = parse_args()

    logger.info("Collecting statistics...")
    data = _read_ragged_rows(args.file)

    df = pd.DataFrame(data)
    if not df.empty:
        # the first item is the dataset name
        df.set_index(df.columns[0], inplace=True)

    outfile = _get_outfile_name(args.file)
    logger.info(f"Saving statistics to {outfile}")
    df.T.to_csv(outfile, index=False, header=True)


if __name__ == "__main__":
    main()

# ---- diff boundary: bin/common.py (new file mode 100644) ----
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.
import logging
from pathlib import Path

import config
import polars as pl

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def parse_header(file: Path, sep: str):
    """Return the column names of a delimited file.

    The header is compared with the first data row: when the header has one
    item less than this row, the gene ID column name is prepended.
    Raises ValueError for any other length mismatch.
    """
    with open(file, "r") as fin:
        header = fin.readline().strip().split(sep)
        first_row = fin.readline().strip().split(sep)

    nb_missing_names = len(first_row) - len(header)
    if nb_missing_names == 0:
        return header
    if nb_missing_names == 1:
        # the first column has no name in the header
        return [config.GENE_ID_COLNAME] + header
    raise ValueError(
        f"Header has length: {len(header)} while first row has length: {len(first_row)}"
    )


def parse_table(file: Path):
    """Read a .csv, .tsv or .parquet file into a polars DataFrame.

    Raises ValueError for any other file extension.
    """
    extension = file.suffix
    if extension == ".parquet":
        return pl.read_parquet(file)
    if extension not in (".csv", ".tsv"):
        raise ValueError(f"Unsupported file format: {file.suffix}")

    separator = "\t" if extension == ".tsv" else ","
    # the header is parsed manually, then skipped by the CSV reader
    column_names = parse_header(file, separator)
    return pl.read_csv(
        file,
        separator=separator,
        has_header=False,
        skip_rows=1,
        new_columns=column_names,
        null_values=["NA", "N/A", "na", "n/a"],
    )


def parse_count_table(file: Path):
    """Read a count table: first column renamed to the gene ID column name
    (cast to String), all other columns cast to Float64."""
    table = parse_table(file)
    # whatever the name of the first col, rename it to "gene_id"
    renamed = table.rename({table.columns[0]: config.GENE_ID_COLNAME})
    return renamed.select(
        pl.col(config.GENE_ID_COLNAME).cast(pl.String()),
        pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()),
    )


def compute_log2(df: pl.DataFrame) -> pl.DataFrame:
    """
    Compute log2(value + 1) for every column except the gene ID column.
    """
    shifted_counts = pl.exclude(config.GENE_ID_COLNAME) + 1
    return df.select(
        pl.col(config.GENE_ID_COLNAME),
        shifted_counts.log(base=2),
    )


def export_parquet(df: pl.DataFrame, count_file: Path, suffix: str):
    """Write df as parquet, named after count_file with its extension replaced by suffix."""
    outfilename = count_file.with_suffix(suffix).name
    logger.info(f"Exporting processed counts to: {outfilename}")
    df.write_parquet(outfilename)


def write_float_csv(df: pl.DataFrame, outfilename: str):
    """Write df as CSV with the float precision set in config."""
    df.write_csv(outfilename, float_precision=config.CSV_FLOAT_PRECISION)


# ---- diff boundary: bin/compute_cpm.py (new file mode 100755) ----
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.

import argparse
import logging
import sys
from pathlib import Path

import config
import polars as pl
from common import compute_log2, export_parquet, parse_count_table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


OUTFILE_SUFFIX = ".cpm.parquet"

WARNING_REASON_FILE = "warning_reason.txt"
FAILURE_REASON_FILE = "failure_reason.txt"


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments (a single --counts path)."""
    parser = argparse.ArgumentParser(description="Normalise data to CPM")
    parser.add_argument(
        "--counts", type=Path, dest="count_file", required=True, help="Count file"
    )
    return parser.parse_args()


def calculate_cpm(df: pl.DataFrame) -> pl.DataFrame:
    """
    Calculate CPM (Counts Per Million) from raw count data.

    Parameters:
    -----------
    df : polars.DataFrame
        DataFrame with genes as rows and samples as columns

    Returns:
    --------
    cpm_df : polars.DataFrame
        DataFrame with CPM values
    """
    sample_counts = pl.exclude(config.GENE_ID_COLNAME)

    # total counts per sample (column sums), as a single-row dataframe
    totals = df.select(sample_counts.sum())

    # CPM: (count / total_counts) * 1,000,000
    cpm_exprs = []
    for sample in df.select(sample_counts).columns:
        cpm_exprs.append((pl.col(sample) / totals[sample][0] * 1e6).alias(sample))

    return df.select([pl.col(config.GENE_ID_COLNAME)] + cpm_exprs)


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    args = parse_args()
    count_file = args.count_file

    logger.info("Parsing data")

    try:
        raw_df = parse_count_table(count_file)

        logger.info(f"Normalising {count_file.name}")
        cpm_df = calculate_cpm(raw_df)

        logger.info("Computing log2 values")
        log_cpm_df = compute_log2(cpm_df)

        export_parquet(log_cpm_df, count_file, OUTFILE_SUFFIX)

    except Exception as e:
        # the detailed error is only logged; the failure file gets a generic message
        logger.error(f"Error occurred while normalising data: {e}")
        msg = "UNEXPECTED ERROR"
        logger.error(msg)
        with open(FAILURE_REASON_FILE, "w") as f:
            f.write(msg)
        sys.exit(0)


if __name__ == "__main__":
    main()

# ---- diff boundary: bin/compute_dataset_statistics.py (new file mode 100755) ----
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
from pathlib import Path

import config
import polars as pl
from common import parse_count_table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

KEY_TO_OUTFILE = {"skewness": "skewness.txt"}
FLOAT_PRECISION = 6


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments (a single --counts path)."""
    parser = argparse.ArgumentParser(
        description="Compute general statistics from count data for each sample"
    )
    parser.add_argument(
        "--counts", type=Path, dest="count_file", required=True, help="Count file"
    )
    return parser.parse_args()


def compute_dataset_statistics(df: pl.DataFrame) -> dict:
    """Compute the skewness of every column except the gene ID column.

    Returns a dict with a single key, "skewness", holding one value per column.
    """
    # sample count skewness
    skewness = df.select(pl.exclude(config.GENE_ID_COLNAME).skew()).row(0)
    return dict(skewness=list(skewness))


def format_value(value) -> str:
    """Format a statistic with FLOAT_PRECISION decimals ("0" for zero).

    A missing value (None) is written as an empty string instead of raising
    a TypeError when formatted.
    """
    # NOTE(review): presumably skew() yields None for a column without any
    # non-null value - confirm, and confirm that an empty field is what the
    # consumers of these files expect for a missing statistic
    if value is None:
        return ""
    return f"{value:.{FLOAT_PRECISION}f}" if value != 0 else "0"


def export_count_data(stats: dict):
    """
    Export dataset statistics to text files.
    Write each statistic to a separate file, on a single comma-separated row
    """
    for key, outfile_name in KEY_TO_OUTFILE.items():
        logger.info(f"Exporting dataset statistics {key} to: {outfile_name}")
        with open(outfile_name, "w") as outfile:
            outfile.write(",".join(format_value(val) for val in stats[key]))


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    args = parse_args()

    count_file = args.count_file

    logger.info(f"Computing dataset statistics for {count_file.name}")
    count_df = parse_count_table(count_file)

    stat_dict = compute_dataset_statistics(count_df)

    export_count_data(stat_dict)


if __name__ == "__main__":
    main()

# ---- diff boundary: bin/compute_gene_statistics.py (new file mode 100755) ----
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
from pathlib import Path

import config
import polars as pl

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# outfile names
ALL_GENES_RESULT_OUTFILE_SUFFIX = "stats_all_genes.csv"

# scaling factor applied to MAD / median (name kept as-is, typo included,
# in case it is referenced elsewhere)
RCV_MULTIFILER = 1.4826  # see https://pmc.ncbi.nlm.nih.gov/articles/PMC9196089/

# quantile intervals
NB_QUANTILES = 100


############################################################################
# POLARS EXTENSIONS
############################################################################


@pl.api.register_expr_namespace("row")
class StatsExtension:
    """
    Expression namespace ``row``: statistics over a list-typed expression.

    Every statistic is computed on the elements of the list left after
    dropping nulls.
    """

    def __init__(self, expr: pl.Expr):
        self._expr = expr

    def not_null_values(self):
        """Return the ``list`` namespace of the expression once nulls are dropped."""
        return self._expr.list.drop_nulls().list

    def mean(self) -> pl.Expr:
        """Mean over non nulls values in row"""
        return self.not_null_values().mean()

    def std(self) -> pl.Expr:
        """Std over non nulls values in row"""
        return self.not_null_values().std()

    def median(self) -> pl.Expr:
        """Median over non nulls values in row"""
        return self.not_null_values().median()

    def mad(self) -> pl.Expr:
        """Median Absolute Deviation over non nulls values in row"""
        return (
            self.not_null_values()
            .eval(
                (pl.element() - pl.element().median()).abs().median()
            )  # returns a list with one element
            .list.first()
        )


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments of the script."""
    parser = argparse.ArgumentParser(
        description="Get base statistics from count data for each gene"
    )
    parser.add_argument(
        "--imputed-counts",
        type=Path,
        dest="imputed_count_file",
        help="Count file with imputed missing values",
    )
    parser.add_argument(
        "--counts", type=Path, dest="count_file", required=True, help="Count file"
    )
    parser.add_argument(
        "--ratio-nulls-per-sample",
        type=Path,
        dest="ratio_nulls_per_samples",
        required=True,
        help="Ratio of null values per sample",
    )
    parser.add_argument(
        "--max-ratio-null-valid-sample",
        type=float,
        dest="max_ratio_null_valid_sample",
        required=True,
        help="Maximum ratio of null values for a sample to be considered valid",
    )
    parser.add_argument("--platform", type=str, help="Platform name")
    return parser.parse_args()


def get_counts(file: Path) -> pl.DataFrame:
    """Read a parquet count file, sorted by gene ID in ascending order."""
    # sorting dataframe (necessary to get consistent output)
    return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False)


def get_colname(colname: str, platform: str | None) -> str:
    """Prefix the column name with "<platform>_" when a platform is given."""
    return f"{platform}_{colname}" if platform else colname


def get_samples(lf: pl.LazyFrame) -> list[str]:
    """Return the names of all columns except the gene ID column."""
    return lf.select(pl.exclude(config.GENE_ID_COLNAME)).collect_schema().names()


def get_valid_samples(
    ratio_nulls_per_samples_df: pl.DataFrame, max_ratio_null_valid_sample: float
) -> list[str]:
    """
    Get samples whose ratio of null values is lower than or equal to the maximum ratio.
    """
    return (
        ratio_nulls_per_samples_df.filter(
            pl.col(config.RATIO_COLNAME) <= max_ratio_null_valid_sample
        )
        .select(config.SAMPLE_COLNAME)
        .to_series()
        .to_list()
    )


def compute_ratios_null_values(
    df: pl.DataFrame, valid_samples: list[str], platform: str | None
) -> pl.DataFrame:
    """
    Compute, for each gene, the ratio of null values over all samples
    and over the valid samples only.

    Parameters
    ----------
    df : count dataframe (gene ID column + one column per sample)
    valid_samples : names of the samples considered valid;
        names that are not columns of ``df`` are ignored
    platform : optional prefix for the output column names

    Returns
    -------
    Dataframe with the gene ID column and the two ratio columns.
    When none of the valid samples is found in ``df``, the second ratio
    falls back to the ratio over all samples.
    """
    nb_samples = len(df.columns) - 1

    # the samples showing a low gene count will not be taken into account for the zero count penalty
    nb_nulls = df.select(pl.exclude(config.GENE_ID_COLNAME).is_null()).sum_horizontal()

    found_valid_samples = [sample for sample in valid_samples if sample in df.columns]

    if found_valid_samples:
        nb_nulls_valid_samples = df.select(
            pl.col(found_valid_samples).is_null()
        ).sum_horizontal()
        nb_valid_samples = len(found_valid_samples)
    else:
        # fallback on all samples: the denominator must follow the numerator,
        # otherwise the counts would be divided by zero
        logger.warning(
            "No valid sample found in count data: using all samples to compute the ratio of nulls"
        )
        nb_nulls_valid_samples = nb_nulls
        nb_valid_samples = nb_samples

    return df.select(
        pl.col(config.GENE_ID_COLNAME),
        (nb_nulls / nb_samples).alias(
            get_colname(config.RATIO_NULLS_COLNAME, platform)
        ),
        (nb_nulls_valid_samples / nb_valid_samples).alias(
            get_colname(config.RATIO_NULLS_VALID_SAMPLES_COLNAME, platform)
        ),
    )


def get_main_statistics(lf: pl.LazyFrame, platform: str | None) -> pl.LazyFrame:
    """
    Compute count descriptive statistics for each gene in the count dataframe.

    The output holds the gene ID, mean, standard deviation, median, MAD,
    std / mean and MAD / median * RCV_MULTIFILER, all computed over the
    non-null values of each row.
    """
    logger.info("Getting descriptive statistics")
    samples = get_samples(lf)
    # computing main stats
    augmented_count_lf = lf.with_columns(
        mean=pl.concat_list(samples).row.mean(),
        std=pl.concat_list(samples).row.std(),
        median=pl.concat_list(samples).row.median(),
        mad=pl.concat_list(samples).row.mad(),
    )

    return augmented_count_lf.select(
        pl.col(config.GENE_ID_COLNAME),
        pl.col("mean").alias(get_colname(config.MEAN_COLNAME, platform)),
        pl.col("std").alias(get_colname(config.STANDARD_DEVIATION_COLNAME, platform)),
        pl.col("median").alias(get_colname(config.MEDIAN_COLNAME, platform)),
        pl.col("mad").alias(get_colname(config.MAD_COLNAME, platform)),
        (pl.col("std") / pl.col("mean")).alias(
            get_colname(config.COEFFICIENT_OF_VARIATION_COLNAME, platform)
        ),
        (pl.col("mad") / pl.col("median") * RCV_MULTIFILER).alias(
            get_colname(config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME, platform)
        ),
    )


def compute_ratio_zeros(
    count_lf: pl.LazyFrame, stat_lf: pl.LazyFrame, platform: str | None
) -> pl.LazyFrame:
    """
    Add to the statistics the ratio of zero values of each gene.

    The ratio is computed from ``count_lf`` and attached to ``stat_lf`` with a
    join on the gene ID, so that the result does not depend on both frames
    having the same rows in the same order.
    """
    nb_samples = len(get_samples(count_lf))
    ratio_zeros_lf = count_lf.select(
        pl.col(config.GENE_ID_COLNAME),
        (pl.sum_horizontal(pl.exclude(config.GENE_ID_COLNAME) == 0) / nb_samples).alias(
            get_colname(config.RATIO_ZEROS_COLNAME, platform)
        ),
    )
    return stat_lf.join(ratio_zeros_lf, on=config.GENE_ID_COLNAME, how="left")


def get_quantile_intervals(lf: pl.LazyFrame, platform: str | None) -> pl.LazyFrame:
    """
    Compute the quantile intervals for the mean expression levels of each gene in the dataframe.

    The function assigns to each gene a quantile interval of its mean cpm compared to all genes.
    """
    logger.info("Getting mean expression quantiles")
    mean_colname = get_colname(config.MEAN_COLNAME, platform)
    return lf.with_columns(
        (
            pl.col(mean_colname).rank(method="ordinal")
            / pl.col(mean_colname).count()
            * NB_QUANTILES
        )
        .floor()
        .cast(pl.Int8)
        # we want the only value = NB_QUANTILES to be NB_QUANTILES - 1
        # because the last quantile interval is [NB_QUANTILES - 1, NB_QUANTILES]
        .replace({NB_QUANTILES: NB_QUANTILES - 1})
        .alias(get_colname(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME, platform))
    )


def export_data(lf: pl.LazyFrame, platform: str | None):
    """Export the gene statistics to a CSV file (prefixed with the platform, if any)."""
    outfile = (
        f"{platform}.{ALL_GENES_RESULT_OUTFILE_SUFFIX}"
        if platform
        else ALL_GENES_RESULT_OUTFILE_SUFFIX
    )
    logger.info(f"Exporting statistics for all genes to: {outfile}")
    lf.sink_csv(outfile, float_precision=config.CSV_FLOAT_PRECISION)
    logger.info("Done")


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    args = parse_args()

    ratio_nulls_per_samples_df = pl.read_csv(args.ratio_nulls_per_samples)
    valid_samples = get_valid_samples(
        ratio_nulls_per_samples_df, args.max_ratio_null_valid_sample
    )

    logger.info("Loading count data (before missing value imputation)")
    non_imputed_count_df = get_counts(args.count_file)

    ratio_nulls_df = compute_ratios_null_values(
        non_imputed_count_df, valid_samples, args.platform
    )

    # deleting non_imputed_count_df in order to free unused memory
    del non_imputed_count_df

    # if the user provided an imputed count file, use it; otherwise, use the original count file
    if args.imputed_count_file:
        logger.info("Using imputed count file")
        count_file = args.imputed_count_file
    else:
        logger.info("Using original count file")
        count_file = args.count_file

    logger.info("Loading count data...")
    count_df = get_counts(count_file)
    logger.info(
        f"Loaded count data with {count_df.shape[0]} rows and {count_df.shape[1]} columns"
    )

    logger.info("Computing statistics and stability score")
    count_lf = count_df.lazy()
    # getting expression statistics
    stat_lf = get_main_statistics(count_lf, args.platform)

    # adding column for nb of null values for each gene
    stat_lf = stat_lf.join(
        ratio_nulls_df.lazy(), on=config.GENE_ID_COLNAME, how="inner"
    )

    # adding a column for the frequency of zero values
    stat_lf = compute_ratio_zeros(count_lf, stat_lf, args.platform)

    # getting quantile intervals
    stat_lf = get_quantile_intervals(stat_lf, args.platform)

    # exporting computed data
    export_data(stat_lf, args.platform)


if __name__ == "__main__":
    main()


# diff --git a/bin/compute_gene_transcript_lengths.py b/bin/compute_gene_transcript_lengths.py
# new file mode 100755
# index 00000000..1bc009bb
# --- /dev/null
# +++ b/bin/compute_gene_transcript_lengths.py
# @@ -0,0 +1,140 @@
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
from pathlib import Path

import config
import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

OUTFILE = "gene_transcript_lengths.csv"

# names given to the 9 tab-separated columns of the annotation file
GFF_COLUMNS = [
    "chromosome",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]

DTYPES = {
    "chromosome": str,
    "source": str,
    "feature": str,
    "start": int,
    "end": int,
    "score": str,
    "strand": str,
    "phase": str,
    "attributes": str,
}


##################################################################
##################################################################
# FUNCTIONS
##################################################################
##################################################################


def parse_args():
    """Parse the command line (a single required ``--annotation`` path)."""
    # the text must be passed as `description`: the first positional argument
    # of ArgumentParser is `prog`, which replaces the program name in the usage line
    parser = argparse.ArgumentParser(
        description="Get CDNA lengths from GFF3 annotation file"
    )
    parser.add_argument(
        "--annotation",
        type=Path,
        dest="annotation_file",
        required=True,
        help="Annotation file in GFF3 format",
    )
    return parser.parse_args()


def parse_gff3_file(annotation_file: Path) -> pd.DataFrame:
    """
    Read the annotation file as a tab-separated table with GFF_COLUMNS as header.

    Lines starting with "#" are skipped and malformed lines only raise a warning.
    """
    return pd.read_csv(
        annotation_file,
        sep="\t",
        names=GFF_COLUMNS,
        dtype=DTYPES,
        comment="#",
        on_bad_lines="warn",
    )


def compute_transcript_lengths(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the length of each transcript as the sum of the lengths of its exons.

    Only exons whose attributes contain "Parent=transcript:<id>" get a transcript ID.
    Returns one row per transcript ID with its summed length.
    """
    exon_df = df.loc[df["feature"] == "exon"].copy()
    # extract transcript ID from attributes column for each exon
    exon_df["transcript_id"] = exon_df["attributes"].str.extract(
        r"Parent=transcript:([^;]+)"
    )
    # compute exon length (start and end are both included)
    exon_df[config.CDNA_LENGTH_COLNAME] = exon_df["end"] - exon_df["start"] + 1
    exon_df = exon_df[["transcript_id", config.CDNA_LENGTH_COLNAME]]
    return exon_df.groupby("transcript_id", as_index=False).agg(
        {config.CDNA_LENGTH_COLNAME: "sum"}
    )


def compute_max_transcript_lengths_per_gene(
    df: pd.DataFrame, transcript_lengths_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Get, for each gene, the length of its longest transcript.

    Parameters
    ----------
    df : parsed annotation table
    transcript_lengths_df : one row per transcript ID with its length

    Returns
    -------
    One row per gene ID with the maximum length found among its transcripts.
    """
    # features whose name contains "RNA" but not "gene"
    rna_cols = [
        feature
        for feature in df["feature"].unique()
        if "RNA" in feature and "gene" not in feature
    ]
    rna_df = df.loc[df["feature"].isin(rna_cols)].copy()

    # extract gene ID from attributes column for each transcript
    rna_df[config.GENE_ID_COLNAME] = rna_df["attributes"].str.extract(
        r"Parent=gene:([^;]+)"
    )
    # extract transcript ID from attributes column
    rna_df["transcript_id"] = rna_df["attributes"].str.extract(r"ID=transcript:([^;]+)")

    # merge with transcript lengths dataframe to get length
    merged_df = rna_df.merge(transcript_lengths_df, how="left", on="transcript_id")

    # a left merge keeps every transcript, so the share of transcripts with a length
    # has to be counted on the non-missing lengths (not on the number of rows)
    nb_transcripts = len(merged_df)
    nb_with_length = int(merged_df[config.CDNA_LENGTH_COLNAME].notna().sum())
    pct_with_length = nb_with_length / nb_transcripts * 100 if nb_transcripts else 0.0
    logger.info(f"Got length for {pct_with_length:.2f}% of transcripts")

    # compute max transcript length per gene
    merged_df = merged_df[[config.GENE_ID_COLNAME, config.CDNA_LENGTH_COLNAME]]
    return merged_df.groupby(config.GENE_ID_COLNAME, as_index=False).agg(
        {config.CDNA_LENGTH_COLNAME: "max"}
    )


##################################################################
##################################################################
# MAIN
##################################################################
##################################################################


def main():
    args = parse_args()

    logger.info("Parsing annotation file")
    df = parse_gff3_file(args.annotation_file)

    logger.info("Computing transcript lengths")
    transcript_lengths_df = compute_transcript_lengths(df)

    logger.info("Getting max transcript length per gene")
    gene_length_df = compute_max_transcript_lengths_per_gene(df, transcript_lengths_df)

    logger.info(f"Writing to {OUTFILE}")
    gene_length_df.to_csv(OUTFILE, index=False, header=True)


if __name__ == "__main__":
    main()


# diff --git a/bin/compute_m_measures.py b/bin/compute_m_measures.py
# new file mode 100755
# index 00000000..bc1d783e
# --- /dev/null
# +++ b/bin/compute_m_measures.py
# @@ -0,0 +1,219 @@
#!/usr/bin/env python3

# Written by Olivier Coen.
# Released under the MIT license.

import argparse
import logging
from pathlib import Path

import config
import polars as pl

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

M_MEASURE_OUTFILE_NAME = "m_measures.csv"

DEFAULT_CHUNKSIZE = 300
NB_GENE_ID_CHUNK_FOLDERS = 100


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments of the script."""
    parser = argparse.ArgumentParser(description="Compute M-measure for each gene")
    parser.add_argument(
        "--counts",
        type=Path,
        dest="count_file",
        required=True,
        help="Count file in Parquet format (used to get the list of gene IDs)",
    )
    parser.add_argument(
        "--std-files",
        type=str,
        dest="std_files",
        required=True,
        help="Space-separated list of files containing std of log expression ratios",
    )
    parser.add_argument(
        "--task-attempts",
        dest="task_attempts",
        type=int,
        default=1,
        help="Number of task attempts",
    )
    return parser.parse_args()


def get_nb_rows(lf: pl.LazyFrame) -> int:
    """Return the number of rows of a lazy frame (the frame is collected)."""
    return lf.select(pl.len()).collect().item()


def concat_all_std_data(files: list[Path], low_memory: bool) -> pl.LazyFrame:
    """
    Scan all parquet files and gather, for each gene ID, all the values found
    in the std column of these files into a single list.
    """
    lfs = [pl.scan_parquet(file, low_memory=low_memory) for file in files]
    lf = pl.concat(lfs)
    return (
        lf.explode(config.RATIOS_STD_COLNAME)
        .group_by(config.GENE_ID_COLNAME)
        .agg(pl.col(config.RATIOS_STD_COLNAME))
    )


def compute_m_measures(lf: pl.LazyFrame) -> pl.LazyFrame:
    """
    Compute the M measure of each gene: sum of the values in its std list
    divided by (number of values - 1).
    """
    return lf.select(
        pl.col(config.GENE_ID_COLNAME),
        (
            pl.col(config.RATIOS_STD_COLNAME).list.sum()
            / (pl.col(config.RATIOS_STD_COLNAME).list.len() - 1)
        ).alias(config.GENORM_M_MEASURE_COLNAME),
    )


def get_chunks(lst: list, chunksize: int):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), chunksize):
        yield lst[i : i + chunksize]
#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    """
    Compute the M measure of every gene and write them to M_MEASURE_OUTFILE_NAME.

    Std data are first redistributed on disk into one folder per chunk of gene IDs,
    then each folder is processed on its own.
    """
    args = parse_args()

    low_memory = args.task_attempts > 1
    files = [Path(file) for file in args.std_files.split(" ")]

    logger.info("Getting list of gene IDs")
    count_lf = pl.scan_parquet(args.count_file, low_memory=low_memory)

    #############################################################################
    # MAKING A FOLDER FOR EACH CHUNK OF GENE IDS
    #############################################################################
    gene_ids = count_lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list()
    gene_ids = sorted(gene_ids)

    chunksize = max(
        1, int(len(gene_ids) / NB_GENE_ID_CHUNK_FOLDERS)
    )  # 1 if len(gene_ids) < NB_GENE_ID_CHUNK_FOLDERS
    gene_id_list_chunks = list(get_chunks(gene_ids, chunksize=chunksize))

    gene_id_chunk_folders = []
    for i in range(len(gene_id_list_chunks)):
        gene_id_chunk_folder = Path(f"gene_ids_{i}")
        gene_id_chunk_folder.mkdir(exist_ok=True)
        gene_id_chunk_folders.append(gene_id_chunk_folder)

    #############################################################################
    # EXPORTING GENE DATA TO THEIR RESPECTIVE CHUNK FOLDER
    #############################################################################
    # progressively decreasing the chunksize if OOM
    # (never below 1: a chunk size of 0 would be an invalid range step)
    chunksize = max(1, DEFAULT_CHUNKSIZE // max(1, args.task_attempts))
    chunk_files_list = [
        files[i : i + chunksize] for i in range(0, len(files), chunksize)
    ]

    logger.info("Parsing std data by chunks")
    for i, chunk_files in enumerate(chunk_files_list):
        # parsing files and making a first list concatenation
        concat_lf = concat_all_std_data(chunk_files, low_memory)

        # looping through each group of gene IDs
        for gene_id_list_chunk, gene_id_chunk_folder in zip(
            gene_id_list_chunks, gene_id_chunk_folders
        ):
            # writing all data corresponding to this group of gene IDs in a specific folder
            outfile = gene_id_chunk_folder / f"chunk.{i}.parquet"
            concat_df = concat_lf.filter(
                pl.col(config.GENE_ID_COLNAME).is_in(gene_id_list_chunk)
            ).collect()
            concat_df.write_parquet(outfile)

    #############################################################################
    # GATHERING ALL DATA CHUNK BY CHUNK AND COMPUTING M MEASURE FOR EACH GENE
    #############################################################################
    computed_genes = 0
    nb_ratios_per_gene = set()
    logger.info(
        "Concatenating all std data by chunk of gene IDs and computing M measures"
    )
    # "w" and not "a": a file left by a previous run must not be extended
    with open(M_MEASURE_OUTFILE_NAME, "w") as fout:
        for i, gene_id_chunk_folder in enumerate(gene_id_chunk_folders):
            chunk_files = list(gene_id_chunk_folder.iterdir())

            concat_lf = concat_all_std_data(chunk_files, low_memory).sort(
                config.GENE_ID_COLNAME
            )

            # computing M measures for these gene IDs
            m_measure_lf = compute_m_measures(concat_lf)
            m_measure_df = m_measure_lf.collect()

            #################################################
            # checks
            #################################################
            if m_measure_df[config.GENE_ID_COLNAME].is_duplicated().any():
                raise ValueError("Duplicate values found for gene IDs!")

            process_gene_ids = sorted(
                m_measure_df.select(config.GENE_ID_COLNAME).to_series().to_list()
            )
            if process_gene_ids != gene_id_list_chunks[i]:
                raise ValueError("Incorrect gene IDs found!")

            computed_genes += len(m_measure_df)

            unique_nb_ratios = (
                concat_lf.with_columns(
                    pl.col(config.RATIOS_STD_COLNAME).list.len().alias("length")
                )
                .select("length")
                .unique()
                .collect()
                .to_series()
                .to_list()
            )
            nb_ratios_per_gene.update(unique_nb_ratios)

            #################################################
            #################################################

            # appending to output file (header only for the first chunk)
            m_measure_df.write_csv(
                fout,
                include_header=(i == 0),
                float_precision=config.CSV_FLOAT_PRECISION,
            )

    logger.info(f"Number of gene IDs: {len(gene_ids)}")
    logger.info(f"Number of computed genes: {computed_genes}")
    if computed_genes != len(gene_ids):
        raise ValueError(
            f"Number of computed genes: {computed_genes} != number of gene IDs: {len(gene_ids)}"
        )

    if len(nb_ratios_per_gene) > 1:
        logger.warning(
            f"Got multiple number of std ratios to compute: {list(nb_ratios_per_gene)}"
        )


if __name__ == "__main__":
    main()


# diff --git a/bin/compute_stability_scores.py b/bin/compute_stability_scores.py
# new file mode 100755
# index 00000000..ad7fbba2
# --- /dev/null
# +++ b/bin/compute_stability_scores.py
# @@ -0,0 +1,261 @@
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.

import argparse
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import ClassVar

import config
import polars as pl
from common import write_float_csv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# outfile names
STATISTICS_WITH_SCORES_OUTFILENAME = "stats_with_scores.csv"


@dataclass
class StabilityScorer:
    """
    Compute a stability score for candidate genes.

    The score of a candidate gene is the ratio of nulls in valid samples
    (times WEIGHT_RATIO_NB_NULLS_TO_SCORING) plus the weighted sum of the
    metrics listed in WEIGHT_FIELDS, each linearly normalised to [0, 1].
    Weights are normalised by the sum of the weights of the metrics present.
    Non-candidate genes get a null score.
    """

    N_QUANTILES: ClassVar[int] = 1000

    # the weights given on the command line are matched to these columns, in this order
    # NOTE(review): the help of --weights lists CV / RCV / Normfinder / Genorm,
    # which is not the order used here -- confirm which one the caller follows
    WEIGHT_FIELDS: ClassVar[list[str]] = [
        config.NORMFINDER_STABILITY_VALUE_COLNAME,
        config.GENORM_M_MEASURE_COLNAME,
        config.COEFFICIENT_OF_VARIATION_COLNAME,
        config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME,
    ]

    WEIGHT_RATIO_NB_NULLS_TO_SCORING: ClassVar[float] = 1

    df: pl.DataFrame
    stability_score_weights_str: str
    weights: dict[str, float] = field(default_factory=dict)

    def __post_init__(self):
        self.parse_stability_score_weights()
        self.compute_stability_score()

    def parse_stability_score_weights(self):
        """Fill ``self.weights`` from the comma-separated weight string."""
        raw_weights = self.stability_score_weights_str.split(",")
        if len(raw_weights) != len(self.WEIGHT_FIELDS):
            # zip() stops at the shortest sequence: say so instead of silently dropping values
            logger.warning(
                f"Expected {len(self.WEIGHT_FIELDS)} weights but got {len(raw_weights)}"
            )
        for weight_field, weight in zip(self.WEIGHT_FIELDS, raw_weights):
            self.weights[weight_field] = float(weight)

    def linear_normalise(self, data: pl.Series, new_name: str) -> pl.Series:
        """
        Linearly normalise a series: (value - min) / (max - min)
        """
        min_val = data.min()
        max_val = data.max()
        return pl.Series(new_name, (data - min_val) / (max_val - min_val))

    @staticmethod
    def get_normalised_col(col: str) -> str:
        return f"{col}_normalised"

    def compute_stability_score(self):
        """Add the candidate flag, the normalised metrics and the stability score to ``self.df``."""
        logger.info("Computing stability score for candidate genes")

        # since Normfinder is always run
        # we can distinguish between candidate and non-candidate genes easily with this column
        self.df = self.df.with_columns(
            pl.when(pl.col(config.NORMFINDER_STABILITY_VALUE_COLNAME).is_not_null())
            .then(1)
            .otherwise(0)
            .alias(config.IS_CANDIDATE_COLNAME)
        )

        # dividing the dataframe into two parts: candidate and non-candidate genes
        candidate_df = self.df.filter(
            pl.col(config.IS_CANDIDATE_COLNAME) == 1
        )  # keep only candidate genes
        non_candidate_df = self.df.filter(pl.col(config.IS_CANDIDATE_COLNAME) == 0)

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # DATA NORMALISATION (TO [0, 1])
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        normalised_data = {}
        null_data = {}
        weight_sum = 0
        # iterate over columns that can participate in stability score calculation
        for col, weight in self.weights.items():
            # if a column is absent, skip it
            if col not in self.df.columns:
                continue
            data = candidate_df.select(col).to_series()
            # for each column present, we perform linear transformation to have values between 0 and 1
            # and put these normalised data in another column suffixed with "_normalised"
            normalised_col = self.get_normalised_col(col)
            normalised_data[col] = self.linear_normalise(data, new_name=normalised_col)
            # creating a null column with same name and same dtype
            # (same dtype so that both dataframes share one schema for the concatenation)
            null_data[col] = pl.Series(
                normalised_col,
                [None] * len(non_candidate_df),
                dtype=normalised_data[col].dtype,
            )
            # counting the sum of weights corresponding to the columns present
            # so that we can normalise the weights afterwards
            weight_sum += weight

        # adding the normalised columns to the candidate genes
        candidate_df = candidate_df.with_columns(list(normalised_data.values()))
        # adding null columns to the non-candidate df to allow concatenation
        non_candidate_df = non_candidate_df.with_columns(list(null_data.values()))

        # concatenating with non candidate genes to have all genes
        self.df = pl.concat([candidate_df, non_candidate_df])

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # GENERAL FORMULA FOR STABILITY
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # adding penalty for samples with null values
        # genes with at least one zero value are already excluded at that stage
        stability_scoring_expr = (
            pl.col(config.RATIO_NULLS_VALID_SAMPLES_COLNAME)
            * self.WEIGHT_RATIO_NB_NULLS_TO_SCORING
        )

        for col, weight in self.weights.items():
            if col not in self.df.columns:
                logger.warning(f"Column {col} not found in dataframe")
                continue
            normalised_col = self.get_normalised_col(col)
            # weight of this metric relative to the metrics actually present
            normalised_weight = weight / weight_sum if weight_sum else 0.0
            # we do not want to include null / nan values in the stability score calculation
            # because this would result in a total null / nan value for the stability score
            stability_scoring_expr += (
                pl.when(
                    pl.col(normalised_col).is_not_null()
                    & pl.col(normalised_col).is_not_nan()
                )
                .then(pl.col(normalised_col))
                .otherwise(pl.lit(0))
            ) * normalised_weight

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        expr = (
            pl.when(pl.col(config.IS_CANDIDATE_COLNAME) == 1)
            .then(stability_scoring_expr)
            .otherwise(None)
        )
        # add stability score column
        self.df = self.df.with_columns(expr.alias(config.STABILITY_SCORE_COLNAME))

    def get_statistics_with_stability_scores(self) -> pl.DataFrame:
        """
        Return all genes sorted by ascending stability score (nulls last),
        with a 1-based rank column following that order.
        """
        return (
            self.df.sort(
                config.STABILITY_SCORE_COLNAME, descending=False, nulls_last=True
            )
            .with_row_index(name="index")
            .with_columns((pl.col("index") + 1).alias(config.RANK_COLNAME))
            .drop("index")
        )


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments of the script."""
    parser = argparse.ArgumentParser(
        description="Computes stability score for each gene"
    )
    parser.add_argument(
        "--stats",
        type=Path,
        dest="stats_file",
        required=True,
        help="Gene Statistics file",
    )
    parser.add_argument(
        "--normfinder-stability",
        type=str,
        required=True,
        dest="normfinder_stability_file",
        help="Output files of Normfinder",
    )
    parser.add_argument(
        "--genorm-stability",
        type=str,
        dest="genorm_stability_file",
        help="Output files of Genorm",
    )
    parser.add_argument(
        "--weights",
        dest="stability_score_weights",
        type=str,
        required=True,
        help="Weights for Coefficient of Variation / Robust Coefficient of Variation on Median / Normfinder / Genorm respectively. Must be a comma-separated string. Example: 0.7,0.1,0.1,0.1",
    )
    return parser.parse_args()


def get_stabilities(stability_files: list[Path]) -> pl.DataFrame:
    """
    Read the stability files (CSV) and left-join them on the gene ID,
    starting from the first file. A candidate flag set to 1 is added.
    """
    df = pl.read_csv(stability_files[0])
    for file in stability_files[1:]:
        new_df = pl.read_csv(file)
        df = df.join(new_df, on=config.GENE_ID_COLNAME, how="left")
    return df.with_columns(pl.lit(1).alias(config.IS_CANDIDATE_COLNAME))


def get_statistics(stat_files: list[Path]) -> pl.DataFrame:
    """Read the statistics files (CSV) and left-join them on the gene ID, starting from the first file."""
    df = pl.read_csv(stat_files[0])
    for file in stat_files[1:]:
        new_df = pl.read_csv(file)
        df = df.join(new_df, on=config.GENE_ID_COLNAME, how="left")
    return df


def export_data(scored_df: pl.DataFrame):
    """Export the statistics with stability scores to a CSV file."""
    logger.info(f"Exporting stability scores to: {STATISTICS_WITH_SCORES_OUTFILENAME}")
    write_float_csv(scored_df, STATISTICS_WITH_SCORES_OUTFILENAME)
    logger.info("Done")


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    args = parse_args()

    stat_df = pl.read_parquet(args.stats_file)

    stability_files = [
        Path(file)
        for file in [args.normfinder_stability_file, args.genorm_stability_file]
        if file is not None
    ]

    # getting stability values computed by Normfinder (and Genorm, when provided)
    stability_df = get_stabilities(stability_files)
    # merges base statistics with computed stability measurements
    df = stat_df.join(stability_df, on=config.GENE_ID_COLNAME, how="left")

    # sort genes according to the metrics present in the dataframe
    stability_scorer = StabilityScorer(df, args.stability_score_weights)
    scored_df = stability_scorer.get_statistics_with_stability_scores()

    # exporting computed data
    export_data(scored_df)


if __name__ == "__main__":
    main()


# diff --git a/bin/compute_tpm.py b/bin/compute_tpm.py
# new file mode 100755
# index 00000000..0dd889f8
# --- /dev/null
# +++ b/bin/compute_tpm.py
# @@ -0,0 +1,173 @@
#!/usr/bin/env python3

# Written by Olivier Coen. Released under the MIT license.

import argparse
import logging
import sys
from pathlib import Path

import config
import polars as pl
from common import (
    compute_log2,
    export_parquet,
    parse_count_table,
    parse_table,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


OUTFILE_SUFFIX = ".tpm.parquet"

WARNING_REASON_FILE = "warning_reason.txt"
FAILURE_REASON_FILE = "failure_reason.txt"


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse the command line arguments of the script."""
    parser = argparse.ArgumentParser(description="Normalise data to TPM")
    parser.add_argument(
        "--counts", type=Path, dest="count_file", required=True, help="Count file"
    )
    parser.add_argument(
        "--gene-lengths",
        type=Path,
        dest="gene_lengths_file",
        required=True,
        help="Gene lengths file (CSV format)",
    )
    return parser.parse_args()


def try_cast_to_int(df: pl.DataFrame) -> pl.DataFrame:
    """Cast to Int64 every count column whose values are all equal to their rounded value."""
    count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns
    # try casting to handle integer values that are float-formated like 1.0
    for col in count_columns:
        is_all_integers = df.select(pl.col(col).round().eq(pl.col(col)).all()).item()
        if is_all_integers:
            df = df.with_columns(pl.col(col).cast(pl.Int64()))
    return df


def is_raw_counts(df: pl.DataFrame) -> bool:
    """Check if the data are raw counts (all count columns have an integer dtype)."""
    integer_dtypes = (
        pl.Int8(),
        pl.Int16(),
        pl.Int32(),
        pl.Int64(),
        pl.UInt8(),
        pl.UInt16(),
        pl.UInt32(),
        pl.UInt64(),
    )
    count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns
    return all(
        dtype in integer_dtypes for dtype in df.select(count_columns).schema.values()
    )


def is_tpm(df: pl.DataFrame) -> bool:
    """Check if the data are TPM (every sample sums to 1e6, with a tolerance of 100)."""
    sample_sums_df = df.select(pl.exclude(config.GENE_ID_COLNAME).sum())
    # a small error is possible, and we assume that if the sum is close to 1e6, it is TPM
    # setting the tolerance to 100
    is_tpm_col_df = sample_sums_df.select((pl.all() - 1e6).abs() < 1e2)
    # all samples must pass: with a single passing sample, the other samples
    # would be returned without any normalisation
    return is_tpm_col_df.select(pl.all_horizontal(pl.all())).item()


def compute_rpkm(df: pl.DataFrame, cdna_length_df: pl.DataFrame) -> pl.DataFrame:
    """
    Divide the counts of each gene by the length of the gene.

    Genes are matched on the gene ID; the join is an inner join by default,
    so genes without a length are dropped.
    """
    logger.info("Computing RPKM.")
    df = df.join(cdna_length_df, on=config.GENE_ID_COLNAME)
    return df.select(
        pl.col(config.GENE_ID_COLNAME),
        pl.exclude([config.GENE_ID_COLNAME, config.CDNA_LENGTH_COLNAME]).truediv(
            pl.col(config.CDNA_LENGTH_COLNAME)
        ),
    )


def compute_tpm_from_rpkm(rpkm_df: pl.DataFrame) -> pl.DataFrame:
    """
    Process RPKM to TPM: each sample column is divided by its sum and multiplied by 1e6.
    """
    logger.info("Computing TPM from RPKM.")
    sums = rpkm_df.select(pl.exclude(config.GENE_ID_COLNAME).sum())
    # Divide each column by its sum and multiply by 1e6
    count_columns = rpkm_df.select(pl.exclude(config.GENE_ID_COLNAME)).columns
    return rpkm_df.select(
        [pl.col(config.GENE_ID_COLNAME)]
        + [(pl.col(col) / sums[col][0] * 1e6).alias(col) for col in count_columns],
    )


def compute_tpm(df: pl.DataFrame, cdna_length_df: pl.DataFrame) -> pl.DataFrame:
    """
    Process raw counts, FPKM, or RPKM to TPM.

    Integer data are treated as raw counts, data whose samples all sum to 1e6
    are returned unchanged, and anything else is assumed to be FPKM / RPKM.
    """
    if is_raw_counts(df):
        logger.info("Raw counts detected → computing TPM directly.")
        rpkm_df = compute_rpkm(df, cdna_length_df)
        return compute_tpm_from_rpkm(rpkm_df)
    elif is_tpm(df):
        logger.info("Data are already TPM. No conversion needed.")
        return df
    else:
        # Convert FPKM/RPKM to TPM
        logger.info("Assuming FPKM/RPKM normalisation.")
        return compute_tpm_from_rpkm(df)


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    args = parse_args()

    try:
        logger.info("Parsing data")
        count_df = parse_count_table(args.count_file)
        cdna_length_df = parse_table(args.gene_lengths_file)

        logger.info("Converting data types")
        count_df = try_cast_to_int(count_df)

        logger.info(f"Normalising {args.count_file.name}")
        count_df = compute_tpm(count_df, cdna_length_df)

        logger.info("Computing log2 values")
        count_df = compute_log2(count_df)

        export_parquet(count_df, args.count_file, OUTFILE_SUFFIX)

    except Exception as e:
        # any failure is reported through FAILURE_REASON_FILE and the script
        # still exits with status 0
        logger.error(f"Error occurred while normalising data: {e}")
        msg = "UNEXPECTED ERROR"
        logger.error(msg)
        with open(FAILURE_REASON_FILE, "w") as f:
            f.write(msg)
        sys.exit(0)


if __name__ == "__main__":
    main()


# diff --git a/bin/config.py b/bin/config.py
# new file mode 100644
# index 00000000..fe217991
# --- /dev/null
# +++ b/bin/config.py
# @@ -0,0 +1,44 @@
# general column names
GENE_ID_COLNAME = "gene_id"
GENE_ID_COUNT_COLNAME = "count"
CDNA_LENGTH_COLNAME = "length"
RANK_COLNAME = "rank"

# base statistics
COEFFICIENT_OF_VARIATION_COLNAME = "coefficient_of_variation"
ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME = (
    "robust_coefficient_of_variation_median"
)
STANDARD_DEVIATION_COLNAME = "standard_deviation"
STABILITY_SCORE_COLNAME = "stability_score"
MEAN_COLNAME = "mean"
MEDIAN_COLNAME = "median"
MAD_COLNAME = "median_absolute_deviation"
EXPRESSION_LEVEL_STATUS_COLNAME = "expression_level_status"
EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME = "expression_level_quantile_interval"
RATIO_NULLS_COLNAME = "ratio_nulls_in_all_samples"
+RATIO_NULLS_VALID_SAMPLES_COLNAME = "ratio_nulls_in_valid_samples" +RATIO_ZEROS_COLNAME = "ratio_zeros" +IS_CANDIDATE_COLNAME = "is_candidate" + +# dataset statistics +KS_TEST_COLNAME = "kolmogorov_smirnov_pvalue" + +# count dataframe +GENE_COUNT_COLNAME = "count" +SAMPLE_COLNAME = "sample" +RATIO_COLNAME = "ratio" + +# gene metadata +ORIGINAL_GENE_ID_COLNAME = "original_gene_id" +ORIGINAL_GENE_IDS_COLNAME = "original_gene_ids" +GENE_NAME_COLNAME = "name" +GENE_DESCRIPTION_COLNAME = "description" +SECTION_COLNAME = "section" + +# computed stability values +NORMFINDER_STABILITY_VALUE_COLNAME = "normfinder_stability_value" +GENORM_M_MEASURE_COLNAME = "genorm_m_measure" +RATIOS_STD_COLNAME = "ratios_stds" + +CSV_FLOAT_PRECISION = 6 diff --git a/bin/deseq2_normalize.R b/bin/deseq2_normalize.R deleted file mode 100755 index 03334487..00000000 --- a/bin/deseq2_normalize.R +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env Rscript - -# Written by Olivier Coen. Released under the MIT license. - -library(DESeq2) -library(optparse) - -##################################################### -##################################################### -# FUNCTIONS -##################################################### -##################################################### - - -get_args <- function() { - - option_list <- list( - make_option("--counts", dest = 'count_file', help = "Path to input count file"), - make_option("--design", dest = 'design_file', help = "Path to input design file") - ) - - args <- parse_args(OptionParser( - option_list = option_list, - description = "Normalize counts using DESeq2" - )) - - return(args) -} - -prefilter_counts <- function(dds, design_data) { - # see https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html - # getting size of smallest group - group_sizes <- table(design_data$condition) - smallest_group_size <- min(group_sizes) - # keep genes with at least 10 counts over a certain number of samples - keep <- 
rowSums(counts(dds) >= 10) >= smallest_group_size - dds <- dds[keep,] - return(dds) -} - -get_normalized_counts <- function(dds) { - # perform normalization - dds <- estimateSizeFactors(dds) - normalized_counts <- counts(dds, normalized = TRUE) - return(normalized_counts) -} - -filter_out_genes_with_at_least_one_zero <- function(count_matrix, normalized_counts) { - # get gene IDs corresponding to rows with only non-zero counts - non_zero_rows <- rownames(count_matrix[apply(count_matrix!=0, 1, all),]) - # filter out genes with zero counts - normalized_counts <- normalized_counts[rownames(normalized_counts) %in% non_zero_rows, ] - return(normalized_counts) -} - -get_log2_cpm_counts <- function(normalized_counts, count_data) { - # calculate total counts per sample (library size) - library_sizes <- colSums(count_data) - # convert normalized counts to CPM - cpm_counts <- t(t(normalized_counts) / library_sizes * 1e6) - cpm_counts <- log2(cpm_counts + 1) - return(cpm_counts) -} - -get_normalized_cpm_counts <- function(count_file, design_file) { - - print(paste('Normalizing counts in:', count_file)) - - count_data <- read.csv(count_file, row.names = 1) - design_data <- read.csv(design_file) - - design_data <- design_data[design_data$sample %in% colnames(count_data), ] - - count_matrix <- as.matrix(count_data) - - col_data <- data.frame( - row.names = design_data$sample, - condition = factor(design_data$condition) - ) - - # check if the column names of count_matrix match the row names of col_data - if (!all(colnames(count_matrix) == rownames(col_data))) { - stop("Sample names in the count matrix do not match the design data.") - } - - dds <- DESeqDataSetFromMatrix(countData = count_matrix, colData = col_data, design = ~ condition) - - # pre-filter genes with low counts - # not absolutely necessary, but good practice to avoid RAM issues - dds <- prefilter_counts(dds, design_data) - - normalized_counts <- get_normalized_counts(dds) - 
#print(length(rownames(normalized_counts))) - - normalized_counts <- filter_out_genes_with_at_least_one_zero(count_matrix, normalized_counts) - #print(length(rownames(normalized_counts))) - - cpm_counts <- get_log2_cpm_counts(normalized_counts, count_data) - - return(cpm_counts) -} - -export_data <- function(cpm_counts, filename) { - filename <- sub("\\.csv$", ".log_cpm.csv", filename) - print(paste('Exporting normalized counts per million to:', filename)) - write.table(cpm_counts, filename, sep = ',', row.names = TRUE, quote = FALSE) -} - -##################################################### -##################################################### -# MAIN -##################################################### -##################################################### - -args <- get_args() - -cpm_counts <- get_normalized_cpm_counts(args$count_file, args$design_file) - -export_data(cpm_counts, basename(args$count_file)) diff --git a/bin/detect_rare_genes.py b/bin/detect_rare_genes.py new file mode 100755 index 00000000..02cc2831 --- /dev/null +++ b/bin/detect_rare_genes.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+
+import argparse
+import logging
+from pathlib import Path
+
+import config
+import polars as pl
+from common import parse_table
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+VALID_GENE_IDS_OUTFILE = "valid_gene_ids.txt"
+TOTAL_OCCURRENCES_OUTFILE = "total_gene_id_occurrence_quantiles.csv"
+
+##################################################################
+# FUNCTIONS
+##################################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Get genes with good occurrence")
+    parser.add_argument(
+        "--occurrences",
+        type=Path,
+        required=True,
+        dest="gene_id_occurrence_file",
+        help="Input file containing gene ID occurrences",
+    )
+    parser.add_argument(
+        "--mappings",
+        type=Path,
+        required=True,
+        dest="mapping_file",
+        help="Mapping file containing gene IDs",
+    )
+    parser.add_argument(
+        "--nb-datasets",
+        type=int,
+        required=True,
+        dest="nb_datasets",
+        help="Number of datasets",
+    )
+    parser.add_argument(
+        "--min-occurrence-frequency",
+        type=float,
+        required=True,
+        dest="min_occurrence_frequency",
+        help="Minimum frequency of occurrences for a gene among all datasets",
+    )
+    parser.add_argument(
+        "--min-occurrence-quantile",
+        type=float,
+        required=True,
+        dest="min_occurrence_quantile",
+        help="Minimum quantile of total occurrences for a gene to be kept",
+    )
+    return parser.parse_args()
+
+
+##################################################################
+# MAIN
+##################################################################
+
+
+def main():
+    args = parse_args()
+
+    original_gene_id_occurrence_df = parse_table(args.gene_id_occurrence_file)
+    mapping_df = parse_table(args.mapping_file)
+    nb_mapped_genes = len(mapping_df)
+
+    df = original_gene_id_occurrence_df.join(
+        mapping_df,
+        on=config.ORIGINAL_GENE_ID_COLNAME,
+    )
+
+    total_gene_id_occurrence_df = df.group_by(config.GENE_ID_COLNAME).agg(
+        pl.col(config.GENE_ID_COUNT_COLNAME).sum().alias("total_occurrences")
+    )
+
+    df = (
+        df.join(
+            total_gene_id_occurrence_df,
+            on=config.GENE_ID_COLNAME,
+        )
+        .with_columns(
+            total_occurrences_quantile=(
+                pl.col("total_occurrences").rank(method="max")
+                / pl.col("total_occurrences").count()
+            ),
+            total_occurrences_frequency=(
+                pl.col("total_occurrences") / args.nb_datasets
+            ),
+        )
+        .select(
+            [
+                config.GENE_ID_COLNAME,
+                "total_occurrences_frequency",
+                "total_occurrences_quantile",
+            ]
+        )
+        .unique()
+    )
+
+    # sorting (for output consistency)
+    df = df.sort(["total_occurrences_quantile", "gene_id"], descending=[True, False])
+
+    # writing total occurrences in a csv before filtering
+    df.select([config.GENE_ID_COLNAME, "total_occurrences_quantile"]).write_csv(
+        TOTAL_OCCURRENCES_OUTFILE
+    )
+
+    # filtering genes
+    valid_gene_ids = (
+        df.filter(pl.col("total_occurrences_quantile") >= args.min_occurrence_quantile)
+        .filter(pl.col("total_occurrences_frequency") >= args.min_occurrence_frequency)
+        .select(config.GENE_ID_COLNAME)
+        .unique()
+        .to_series()
+        .to_list()
+    )
+
+    with open(VALID_GENE_IDS_OUTFILE, "w") as f:
+        f.write("\n".join(valid_gene_ids))
+
+    nb_valid_genes = len(valid_gene_ids)
+    # max(..., 1) keeps the summary from raising on an empty mapping table
+    logger.info(
+        f"Found {nb_valid_genes} valid gene IDs ({nb_valid_genes / max(nb_mapped_genes, 1):.2%})"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/download_eatlas_data.R b/bin/download_eatlas_data.R
new file mode 100755
index 00000000..2684908a
--- /dev/null
+++ b/bin/download_eatlas_data.R
@@ -0,0 +1,233 @@
+#!/usr/bin/env Rscript
+
+# Written by Olivier Coen. Released under the MIT license.
+ +options(error = traceback) +suppressPackageStartupMessages(library("ExpressionAtlas")) +library(ExpressionAtlas) +library(optparse) + +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + +get_args <- function() { + option_list <- list( + make_option("--accession", type = "character", help = "Accession number of expression atlas experiment. Example: E-MTAB-552") + ) + + args <- parse_args(OptionParser( + option_list = option_list, + description = "Get expression atlas data" + )) + return(args) +} + +download_expression_atlas_data_with_retries <- function(accession, max_retries = 3, wait_time = 5) { + success <- FALSE + attempts <- 0 + + while (!success && attempts < max_retries) { + attempts <- attempts + 1 + + tryCatch({ + atlas_data <- ExpressionAtlas::getAtlasData( accession ) + success <- TRUE + + }, warning = function(w) { + + # if the accession os not valid, we stop immediately (useless to keep going) + if (grepl("does not look like an ArrayExpress/BioStudies experiment accession.", w$message)) { + warning(w$message) + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + + # else, retrying + message("Attempt ", attempts, " Warning: ", w$message) + + if (attempts < max_retries) { + warning("Retrying in ", wait_time, " seconds...") + Sys.sleep(wait_time) + + } else { + + if (grepl("550 Requested action not taken; file unavailable", w$message)) { + warning(w$message) + write("EXPERIMENT SUMMARY NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } else if (grepl("Failure when receiving data from the peer", w$message)) { + warning(w$message) + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status 
= 0) + } else if (grepl("FTP status was", w$message)) { + warning(w$message) + write("FTP ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 101) + } else { + warning("Unhandled warning: ", w$message) + write("UNKNOWN ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + } + + }, error = function(e) { + + message("Attempt ", attempts, " Message: ", e$message) + + if (attempts < max_retries) { + warning("Retrying in ", wait_time, " seconds...") + Sys.sleep(wait_time) + + } else { + + if (grepl("Download appeared successful but no experiment summary object was found", e$message)) { + warning(e$message) + write("EXPERIMENT SUMMARY NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } else { + warning("Unhandled error: ", e$message) + write("UNKNOWN ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + + } + }) + } + + return(atlas_data) +} + +get_rnaseq_data <- function(data) { + return(list( + count_data = assays( data )$counts, + platform = 'rnaseq', + count_type = 'raw', # rnaseq data are raw in ExpressionAtlas + sample_groups = colData(data)$AtlasAssayGroup + )) +} + +get_one_colour_microarray_data <- function(data) { + return(list( + count_data = exprs( data ), + platform = 'microarray', + count_type = 'normalised', # one colour microarray data are already normalised in ExpressionAtlas + sample_groups = phenoData(data)$AtlasAssayGroup + )) +} + +get_batch_id <- function(accession, data_type) { + batch_id <- paste0(accession, '_', data_type) + # cleaning + batch_id <- gsub("-", "_", batch_id) + return(batch_id) +} + +get_new_sample_names <- function(result, batch_id) { + new_colnames <- paste0(batch_id, '_', colnames(result$count_data)) + return(new_colnames) +} + +export_count_data <- function(result, batch_id) { + + # renaming columns, to make them specific to accession and data type + colnames(result$count_data) <- get_new_sample_names(result, batch_id) + + outfilename <- 
paste0(batch_id, '.', result$platform, '.', result$count_type, '.counts.csv')
+
+    # exporting to CSV file
+    # index represents gene names
+    cat(paste('Exporting count data to file', outfilename))
+    write.table(result$count_data, outfilename, sep = ',', row.names = TRUE, col.names = TRUE, quote = FALSE)
+}
+
+export_metadata <- function(result, batch_id) {
+
+    new_colnames <- get_new_sample_names(result, batch_id)
+    batch_list <- rep(batch_id, length(new_colnames))
+
+    df <- data.frame(
+        batch = batch_list,
+        condition = result$sample_groups,
+        sample = new_colnames
+    )
+
+    outfilename <- paste0(batch_id, '.design.csv')
+    cat(paste('Exporting design data to file', outfilename))
+    write.table(df, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
+}
+
+
+process_data <- function(atlas_data, accession) {
+
+    eset <- atlas_data[[ accession ]]
+
+    # looping through each data type (ex: 'rnaseq') in the experiment
+    for (data_type in names(eset)) {
+
+        data <- eset[[ data_type ]]
+
+        skip_iteration <- FALSE
+        # getting count dataframe
+        tryCatch({
+
+            if ( data_type == 'rnaseq' ) {
+                result <- get_rnaseq_data(data)
+            } else if ( startsWith(data_type, 'A-') ) { # typically: A-AFFY- or A-GEOD-
+                result <- get_one_colour_microarray_data(data)
+            } else {
+                warning(paste("Unknown data type:", data_type))
+                write(paste("UNKNOWN DATA TYPE:", data_type), file = WARNING_REASON_FILE, append=TRUE)
+                skip_iteration <- TRUE # plain `<-`: this expression runs in the function's own frame, `<<-` would set a global instead
+            }
+
+        }, error = function(e) {
+            warning(paste("Caught an error: ", e$message))
+            write(paste('ERROR: COULD NOT GET ASSAY DATA FOR EXPERIMENT ID', accession, 'AND DATA TYPE', data_type), file = WARNING_REASON_FILE, append=TRUE)
+            skip_iteration <<- TRUE # `<<-` is right here: the handler is a nested closure
+        })
+
+        # If an error occurred, skip to the next iteration
+        if (skip_iteration) {
+            next
+        }
+
+        batch_id <- get_batch_id(accession, data_type)
+
+        # exporting count data to CSV
+        export_count_data(result, batch_id)
+
+        # exporting metadata to CSV
+        export_metadata(result, batch_id)
+    }
+
+}
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+args <- get_args()
+
+cat(paste("Getting data for accession", args$accession, "\n"))
+
+accession <- trimws(args$accession)
+if (startsWith(accession, "E-PROT")) {
+    warning("Ignoring the ", accession, " experiment.")
+    write("PROTEOME ACCESSIONS NOT HANDLED", file = FAILURE_REASON_FILE)
+    quit(save = "no", status = 0)
+}
+
+# searching and downloading expression atlas data (using the trimmed accession)
+atlas_data <- download_expression_atlas_data_with_retries(accession)
+
+# writing count data in atlas_data to specific CSV files
+process_data(atlas_data, accession)
diff --git a/bin/download_geo_data.R b/bin/download_geo_data.R
new file mode 100755
index 00000000..2686f13c
--- /dev/null
+++ b/bin/download_geo_data.R
@@ -0,0 +1,827 @@
+#!/usr/bin/env Rscript
+
+# Written by Olivier Coen. Released under the MIT license.
+ +suppressPackageStartupMessages(library("GEOquery")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("tibble")) +suppressPackageStartupMessages(library("stringr")) +library(GEOquery) +library(optparse) +library(dplyr) +library(tibble) +library(stringr) + +options(error = traceback) + +COUNT_FILE_EXTENSION <- ".counts.csv" +DESIGN_FILE_EXTENSION <- ".design.csv" +MAPPING_FILE_EXTENSION <- ".sample_name_mapping.csv" +METADATA_FILE_EXTENSION <- ".platform_metadata.csv" +BASE_REJECTED_DIR <- "rejected" + +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + +##################################################### +##################################################### +# ARG PARSER +##################################################### +##################################################### + +get_args <- function() { + option_list <- list( + make_option("--accession", type = "character", help = "Accession number of GEO dataset. 
Example: GSE56413"),
+        make_option("--species", type = "character", help = "Species name")
+    )
+
+    args <- parse_args(OptionParser(
+        option_list = option_list,
+        description = "Get GEO data"
+    ))
+    return(args)
+}
+
+
+#####################################################
+#####################################################
+# UTILS
+#####################################################
+#####################################################
+
+format_species_name <- function(x) {
+    x <- tools::toTitleCase(x)
+    x <- gsub("[_-]", " ", x)
+    return(x)
+}
+
+write_warning <- function(msg) {
+    message(msg)
+    file_conn <- file( WARNING_REASON_FILE, open = "a")
+    cat(paste0(msg, "; "), file = file_conn, sep = "", fill = FALSE)
+    close(file_conn)
+}
+
+
+get_extensions <- function(file){
+    extensions <- strsplit(basename(file), split="\\.")[[1]]
+    return(extensions)
+}
+
+
+get_rejected_dir <- function(platform, series) {
+    rejected_dir <- file.path(BASE_REJECTED_DIR, paste0(series$accession, '_', platform$id))
+    dir.exists(rejected_dir) || dir.create(rejected_dir, recursive = TRUE)
+    return(rejected_dir)
+}
+
+
+clean_column_names <- function(df){
+    # append a positional suffix to every column name, but only when some names collide
+    if (length(unique(colnames(df))) < length(colnames(df))){
+        colnames(df) <- paste0(colnames(df), '_', seq_along(df))
+    }
+    return(df) # always hand the data frame back, renamed or not (previously NULL when no duplicates)
+}
+
+
+#####################################################
+#####################################################
+# DOWNLOAD
+#####################################################
+#####################################################
+
+download_geo_data_with_retries <- function(accession, max_retries = 3, wait_time = 5) {
+
+    success <- FALSE
+    attempts <- 0
+
+    while (!success && attempts < max_retries) {
+        attempts <- attempts + 1
+
+        tryCatch({
+            geo_data <- GEOquery::getGEO( accession )
+            success <- TRUE
+
+        }, error = function(e) {
+
+            message("Attempt ", attempts, " Message: ", e$message)
+
+            if (attempts < max_retries) {
+                warning("Retrying in ", wait_time, " seconds...")
+                
Sys.sleep(wait_time) + + } else { + warning("Unhandled error: ", e$message) + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + }) + + } + return(geo_data) +} + +##################################################### +##################################################### +# PARSE SERIES / PLATFORM METADATA +##################################################### +##################################################### + +get_experiment_data <- function(geo_data) { + data <- geo_data[[1]] + experiment_data <- experimentData(data) + #print(experiment_data) + return(experiment_data) +} + + +get_experiment_type <- function(geo_data) { + experiment_data <- get_experiment_data(geo_data) + experiment_type <- tolower(attr(experiment_data, "other")$type) + if (experiment_type == "expression profiling by high throughput sequencing") { + return("rnaseq") + } else if (experiment_type == "expression profiling by array") { + return("microarray") + } else { + return(gsub("\n", " ; ", experiment_type)) + } +} + +get_series_species <- function(geo_data) { + message("Getting species included in series") + species_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + li <- unique(metadata$organism_ch1) + # check if organism_ch2 exists + if ("organism_ch2" %in% colnames(metadata)) { + li <- append(li, unique(metadata$organism_ch2)) + } + species_list[[i]] <- li + } + species_list <- unique(unlist(species_list)) + return(species_list) +} + + +get_series_supplementary_data <- function(geo_data, series) { + series_species <- get_series_species(geo_data) + if (length(series_species) > 1) { + message(paste("Multiple species found in series:", paste(series_species, collapse = ", "), ". 
Will not download supplementary data"))
+        return(list())
+    } else if (length(series_species) == 0) {
+        message("No species found in series...")
+        return(list())
+    } else {
+        if (series_species != series$species) {
+            message(paste("Species provided by the user:", series$species, "does not match species in GEO data:", series_species))
+            return(list())
+        }
+        experiment_data <- get_experiment_data(geo_data)
+        suppl_data_str <- attr(experiment_data, "other")$supplementary_file
+        return(stringr::str_split(suppl_data_str, "\n")[[1]])
+    }
+}
+
+
+get_platform_id <- function(metadata) {
+    platform_id <- as.character(unique(metadata$platform_id))[1]
+    return(platform_id)
+}
+
+
+#####################################################
+#####################################################
+# RNASEQ SAMPLES
+#####################################################
+#####################################################
+
+get_rnaseq_samples <- function(geo_data, design_df) {
+
+    rnaseq_sample_df_list <- list()
+    for (i in seq_along(geo_data)) { # seq_along: no iteration at all when geo_data is empty
+        data <- geo_data[[ i ]]
+        metadata <- pData(data)
+        if (!("library_strategy" %in% colnames(metadata))) {
+            message("library_strategy column not found in metadata")
+            next
+        }
+        rnaseq_sample_df_list[[i]] <- metadata %>%
+            filter(library_strategy == "RNA-Seq" & geo_accession %in% design_df$sample) %>%
+            select(geo_accession)
+    }
+    # concatenate rows
+    rnaseq_sample_df <- Reduce(
+        function(df1, df2) dplyr::bind_rows(df1, df2),
+        rnaseq_sample_df_list
+    )
+    return(rnaseq_sample_df$geo_accession)
+}
+
+
+#####################################################
+#####################################################
+# SAMPLE NAME MAPPING
+#####################################################
+#####################################################
+
+
+make_sample_name_mapping <- function(geo_data) {
+    message("Making sample name mapping")
+    mapping_df_list <- list()
+    for (i in 1:length(geo_data)) {
+        data <- geo_data[[ i ]]
+        metadata <- pData(data)
+        mapping_df_list[[i]] <- metadata %>%
+            
pData(data) + mapping_df_list[[i]] <- metadata %>% + mutate( + sample_id = geo_accession, + sample_name = title + ) %>% + select(sample_id, sample_name) + } + # concatenate rows + mapping_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + mapping_df_list + ) + return(mapping_df) +} + +rename_columns <- function(df, mapping_df) { + id_map <- setNames(mapping_df$sample_id, mapping_df$sample_name) + names(df) <- ifelse( + names(df) %in% names(id_map), + id_map[names(df)], + names(df) + ) + return(df) +} + +##################################################### +##################################################### +# DESIGN +##################################################### +##################################################### + +get_samples_for_species <- function(metadata, species) { + # check if organism_ch2 exists + if ("organism_ch2" %in% colnames(metadata)) { + keep <- metadata$organism_ch1 == species & metadata$organism_ch2 == species + } else { + keep <- metadata$organism_ch1 == species + } + + # return a data.frame with matching samples + return(metadata$geo_accession[keep]) +} + + +get_columns_for_grouping <- function(df) { + + base_columns <- c("characteristics", "treatment_protocol", "label_protocol", "extract_protocol", "growth_protocol") + + columns_to_group <- c() + for (base_col in base_columns) { + ch1_col <- paste0(base_col, "_ch1") + ch2_col <- paste0(base_col, "_ch2") + + if (ch1_col %in% colnames(df)) { + columns_to_group <- c(columns_to_group, ch1_col) + } + if (ch2_col %in% colnames(df)) { + columns_to_group <- c(columns_to_group, ch2_col) + } + } + + return(columns_to_group) +} + + +build_design_dataframe <- function(df, accession) { + columns_to_group <- get_columns_for_grouping(df) + + design_df <- df %>% + mutate(sample = geo_accession) %>% # change column name geo_accession to sample + group_by(!!!syms(columns_to_group)) %>% # group by all columns for grouping found + mutate(group_num = cur_group_id()) %>% # create column 
made from group id + ungroup() %>% + mutate( + condition = paste0("G", group_num), # create condition column from group number + batch = accession + ) %>% + select(sample, condition, batch) %>% + arrange(condition) + + return(design_df) +} + + +get_design_for_platform <- function(design_df, metadata) { + platform_samples <- metadata$geo_accession + platform_design_df <- design_df %>% + filter(sample %in% platform_samples) + return(platform_design_df) +} + +get_design_for_rnaseq <- function(design_df, rnaseq_samples) { + rnaseq_design_df <- design_df %>% + filter(sample %in% rnaseq_samples) + return(rnaseq_design_df) +} + + +make_design <- function(metadata, series) { + design_df <- build_design_dataframe(metadata, series$accession) + # get samples corresponding to species + species_samples <- get_samples_for_species(metadata, series$species) + # filter design dataframe + design_df <- design_df %>% + filter(sample %in% species_samples) + return(design_df) +} + + +make_overall_design <- function(geo_data, series) { + message("Making overall design") + design_df_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + #print(metadata) + # make design dataframe + # keep only samples corresponding to the species of interest + design_df <- make_design(metadata, series) + design_df_list[[i]] <- design_df + } + # full outer join + design_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + design_df_list + ) + return(design_df) +} + + +##################################################### +##################################################### +# PARSE COUNTS FROM DATA +##################################################### +##################################################### + + +get_microarray_counts <- function(platform) { + # get count data corresponding to samples in the design + counts <- data.frame(exprs(platform$data)) %>% + select(all_of(platform$design$sample)) + # for now, only one element in the list + 
return(counts)
+}
+
+parse_first_line <- function(filename, sep){
+    tryCatch({
+        counts <- read.table(filename, header = FALSE, sep = sep, row.names = 1, nrows = 1)
+        return(counts)
+    }, error = function(e) {
+        write_warning(paste("ERROR PARSING FIRST LINE IN", filename))
+        return(NULL)
+    })
+}
+
+download_file <- function(data_url, filename){
+    tryCatch({
+        status <- download.file(data_url, filename, method = "wget", quiet = TRUE) # wget failures come back as a non-zero status, not an error
+        if (status != 0) stop("wget exited with status ", status) else return("SUCCESS")
+    }, error = function(e) {
+        write_warning(paste("ERROR WHILE DOWNLOADING:", filename))
+        return("FAILURE")
+    })
+}
+
+
+get_raw_counts_from_url <- function(data_url) {
+
+    if ( is.na(data_url) || tolower(data_url) == "none" || data_url == "") {
+        write_warning(paste("MISFORMED URL:", data_url))
+        return(NULL)
+    }
+
+    filename <- tolower(basename(data_url))
+    extensions <- get_extensions(filename)
+    ext <- extensions[length(extensions)]
+    if (ext == "gz") {
+        ext <- extensions[length(extensions) - 1]
+    }
+    if (!(ext %in% c("txt", "tsv", "csv", "tab"))) {
+        write_warning(paste("UNSUPPORTED EXTENSION:", ext, "for URL:", data_url))
+        return(NULL)
+    }
+
+    message(paste("Downloading", filename))
+    download_status <- download_file(data_url, filename)
+    if (download_status == "FAILURE") {
+        return(NULL)
+    }
+
+    separator <- NULL
+    for (sep in c("\t", ",", " ")) {
+
+        # parsing the first line to determine the separator and see if there is a header
+        first_line <- parse_first_line(filename, sep)
+        if (is.null(first_line)) {
+            return(NULL)
+        }
+
+        if (ncol(first_line) > 0) {
+            separator <- sep
+            if (is.numeric(first_line[1, 1])) {
+                has_header <- FALSE
+            } else {
+                has_header <- TRUE
+            }
+            break
+        }
+    }
+
+    if (is.null(separator)) {
+        write_warning(paste("NO VALID SEPARATOR:", filename))
+        return(NULL)
+    }
+
+    message(paste("Parsing", filename))
+    counts <- tryCatch({ # keep the tryCatch value: the handler's NULL must reach `counts`
+        read.table(filename, header = has_header, sep = separator, row.names = 1)
+    }, error = function(e) {
+        write_warning(paste("ERROR WHILE PARSING", filename))
+        return(NULL)
+    })
+    if (is.null(counts)) return(NULL) # parsing failed; warning already recorded
+    # removes rows that are all NA
+    counts <- counts[rowSums(!is.na(counts)) > 0, , drop = FALSE]
+    return(counts)
+}
+
+
+get_all_rnaseq_counts <- function(platform) {
+    pdata <- platform$metadata
+    # getting list of samples
+    samples <- pdata$geo_accession
+    # getting list of columns corresponding to supp data
+    # IMPORTANT: we assume here that data are of the same type (raw, TPM, FPKM, etc.) in each supplementary file column
+    supplementary_cols <- grep("^supplementary_file(_\\d)?$", names(pdata), value = TRUE)
+
+    if (length(supplementary_cols) == 0) {
+        message("No supplementary files found")
+        return(data.frame())
+    } else if (length(supplementary_cols) > 1) {
+        message("Multiple supplementary files found")
+    }
+
+    suppl_df_cpt <- 1
+    suppl_count_dfs <- list()
+    # building one count dataframe by type of suppl data
+    for (i in 1:length(supplementary_cols)) {
+
+        count_df_list <- list()
+        cpt = 1
+        for (j in seq_along(samples)) {
+            sample <- samples[[j]]
+            data_url <- pdata[pdata$geo_accession == sample, supplementary_cols[i]]
+
+            counts <- get_raw_counts_from_url(data_url)
+            if (is.null(counts)) {
+                next
+            }
+
+            if (ncol(counts) == 1) {
+                colnames(counts) <- c(sample)
+            } else {
+                # if multiple columns, we don't know how to deal with it
+                # but it will be filtered out later at column match checking
+                message(paste("Multiple columns found for sample", sample))
+            }
+
+            # in case there is already a gene_id column, remove it
+            if ("gene_id" %in% names(counts)) {
+                counts <- counts[, -which(names(counts) == "gene_id"), drop = FALSE]
+            }
+            # setting the row names (gene ids) as a column
+            counts <- tibble::rownames_to_column(counts, var = "gene_id")
+            # adding to list
+            count_df_list[[cpt]] <- counts
+            cpt = cpt + 1
+        }
+
+        # checking if all files were skipped
+        if (length(count_df_list) == 0) {
+            message("No valid files found")
+            next
+        }
+
+        # full outer join
+        joined_df <- Reduce(
+            function(df1, df2) merge(df1, df2, by = "gene_id", all = TRUE),
+            
count_df_list + ) + # setting the column gene_id as row names + joined_df <- tibble::column_to_rownames(joined_df, var = "gene_id") + # cleaning column names in case of duplicates + # it should happen only when there were multiple columns for the same sample + joined_df <- clean_column_names(joined_df) + + suppl_count_dfs[[suppl_df_cpt]] <- joined_df + suppl_df_cpt = suppl_df_cpt + 1 + } + return(suppl_count_dfs) +} + + +##################################################### +##################################################### +# DATA QUALITY CONTROL +##################################################### +##################################################### + +is_valid_microarray <- function(counts, platform) { + + if (!all(colnames(counts) %in% platform$design$sample)) { + message("Column names do not match samples in design") + return(FALSE) + } + + vals <- unlist(counts, use.names = FALSE) + vals <- vals[!is.na(vals)] + + all_integers <- all(abs(vals - round(vals)) < 1e-8) + value_range <- range(vals, na.rm = TRUE) + + if (value_range[2] <= 20) { + message(paste(platform$id, ": normalized, log2 scale (e.g. 
RMA, quantile)")) + return(TRUE) + } else if (all_integers) { + write_warning(paste(platform$id, ": RAW PROBE INTENSITIES FOUND")) + return(FALSE) + } else if (value_range[2] > 1000) { + write_warning(paste(platform$id, ": PARSED INTENSITIES: NORMALIZED BUT NOT LOG-TRANSFORMED")) + return(FALSE) + } else { + write_warning(paste(platform$id, ": UNCLEAR DATA ORIGIN: CHECK GEO METADATA")) + return(FALSE) + } +} + +is_valid_rnaseq <- function(counts, platform) { + + if (!all(colnames(counts) %in% platform$design$sample)) { + message(paste(platform$id, ": column names do not match samples in design")) + return(FALSE) + } + + return(TRUE) +} + + +check_rnaseq_normalisation_state <- function(counts, platform) { + + # checking which columns (samples) contain only integer values; NA cells are ignored + tryCatch({ + is_all_integer <- function(x) all(floor(x) == x, na.rm = TRUE) + int_counts <- counts %>% + select_if(is_all_integer) + + # select_if filters columns, so columns (not rows) are compared: almost no integer column means decimals + if (ncol(int_counts) < ncol(counts) * 0.01 ) { + return("normalised") + } else if (ncol(int_counts) == ncol(counts)) { + return("raw") + } else { + return("unknown") + } + + }, error = function(e) { + write_warning(paste(platform$id, ": COULD NOT COMPUTE FLOOR")) + return("unknown") + }) + +} + + +##################################################### +##################################################### +# EXPORT +##################################################### +##################################################### + +export_count_data <- function(data, platform, series) { + # renaming columns, to make them specific to accession and data type + colnames(data$counts) <- paste0(series$accession, '_', colnames(data$counts)) + outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, COUNT_FILE_EXTENSION) + if (!data$is_valid) { + outfilename <- file.path(get_rejected_dir(platform, series), outfilename) + } + + # exporting to CSV file + # index represents gene names + message(paste(platform$id, ': exporting 
count data to file', outfilename)) + write.table(data$counts, outfilename, sep = ',', row.names = TRUE, col.names = TRUE, quote = FALSE) +} + + +export_design <- function(data, platform, series) { + new_sample_names <- paste0(series$accession, '_', series$design$sample) + design_df <- series$design %>% + mutate(sample = new_sample_names ) %>% + select(sample, condition, batch) + + outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type,'.', data$norm_state, DESIGN_FILE_EXTENSION) + if (!data$is_valid) { + outfilename <- file.path(get_rejected_dir(platform, series), outfilename) + } + + message(paste(platform$id, ': exporting design data to file', outfilename)) + write.table(design_df, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) +} + + +export_name_mapping <- function(data, platform, series) { + outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, MAPPING_FILE_EXTENSION) + if (!data$is_valid) { + outfilename <- file.path(get_rejected_dir(platform, series), outfilename) + } + message(paste(platform$id, ': exporting name mapping to file', outfilename)) + write.table(series$mapping, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) +} + +export_metadata <- function(data, platform, series) { + outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, METADATA_FILE_EXTENSION) + if (!data$is_valid) { + outfilename <- file.path(get_rejected_dir(platform, series), outfilename) + } + message(paste(platform$id, ': exporting metadata to file', outfilename)) + write.table(platform$metadata, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) +} + + +##################################################### +##################################################### +# PROCESS DATA +##################################################### +##################################################### + 
+post_process_and_export <- function(data, platform, series) { + # keeping only non empty data + if (nrow(data$counts) == 0 || ncol(data$counts) == 0) { + message(paste(platform$id, ': no data found')) + write_warning(paste(platform$id, ": NO DATA")) + return(NULL) + } + # rename columns when needed; the result is stored back in data so that the exports below use it + data$counts <- rename_columns(data$counts, series$mapping) + + export_count_data(data, platform, series) + export_design(data, platform, series) + export_name_mapping(data, platform, series) + export_metadata(data, platform, series) +} + + +process_platform_data <- function(platform, series) { + + platform$metadata <- pData(platform$data) + platform$design <- get_design_for_platform(series$design, platform$metadata) + valid_samples <- as.character(platform$design$sample) + platform$id <- get_platform_id(platform$metadata) + + if (length(valid_samples) == 0) { + message(paste(platform$id, ": no sample corresponding to species", series$species)) + return(NULL) + } + + if (platform$type == "microarray") { + + counts <- get_microarray_counts(platform) + data <- list( counts = counts ) + data$is_valid <- is_valid_microarray(counts, platform) + data$norm_state <- "normalised" + post_process_and_export(data, platform, series) + + } else { + + parsed_counts <- get_all_rnaseq_counts(platform) + for (counts in parsed_counts) { + data <- list( + counts = counts, + is_valid = is_valid_rnaseq(counts, platform), + norm_state = check_rnaseq_normalisation_state(counts, platform) + ) + post_process_and_export(data, platform, series) + } + + } + +} + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +main <- function() { + + args <- get_args() + + series <- list() + + series$accession <- args$accession + series$species <- format_species_name(args$species) + + message(paste("Getting data for accession", 
series$accession)) + # searching and downloading GEO data + geo_data <- download_geo_data_with_retries(series$accession) + + # make a single design dataframe for all samples in the series + series$design <- make_overall_design(geo_data, series) + # NROW also catches a design that has columns but no rows (length() of a data frame counts columns) + if ( NROW(series$design) == 0 ) { + message(paste("No sample corresponding to species", series$species)) + write(paste("NO SAMPLES FOR SPECIES", series$species), file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + + # make a map associating sample names to sample IDs + series$mapping <- make_sample_name_mapping(geo_data) + + series$experiment_type <- get_experiment_type(geo_data) + + suppl_data_urls <- get_series_supplementary_data(geo_data, series) + # for now, considering suppl data as raw rnaseq data + # TODO: check if these are always raw rnaseq data + if (length(suppl_data_urls) > 0) { + + message("Processing supplementary data") + for (supp_data_url in suppl_data_urls) { + counts <- get_raw_counts_from_url(supp_data_url) + if (is.null(counts)) { + next + } + platform <- list( + type = "rnaseq", + id = "suppl", + design = series$design + ) + data <- list( + counts = counts, + is_valid = is_valid_rnaseq(counts, platform), + norm_state = check_rnaseq_normalisation_state(counts, platform) + ) + post_process_and_export(data, platform, series) + } + + } + + # NOTE: we consider that a series is either a microarray series OR contains RNA-seq data + # mixed types should be found only in SuperSeries, and it is not handled for now + if ( series$experiment_type == "microarray" ) { + + message("Processing microarray data") + for (i in seq_along(geo_data)) { + platform <- list( + type = "microarray", + data = geo_data[[ i ]] + ) + process_platform_data(platform, series) + } + + } else { + + rnaseq_samples <- get_rnaseq_samples(geo_data, series$design) + if ( series$experiment_type == "rnaseq" || length(rnaseq_samples) > 0 ) { + + message("Processing RNA-seq data") + # taking a subset of the design 
corresponding to bona-fide RNA-seq samples + rnaseq_design_df <- get_design_for_rnaseq(series$design, rnaseq_samples) + for (i in 1:length(geo_data)) { + platform <- list( + type = "rnaseq", + count_type = "raw", + data = geo_data[[ i ]] + ) + process_platform_data(platform, series) + } + + } else { + write_warning(paste("UNSUPPORTED PLATFORM:", series$experiment_type)) + } + } + + message("Done") +} + + +##################################################### +# ENTRYPOINT +##################################################### +main() diff --git a/bin/download_latest_ensembl_annotation.py b/bin/download_latest_ensembl_annotation.py new file mode 100755 index 00000000..5c7278f1 --- /dev/null +++ b/bin/download_latest_ensembl_annotation.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from datetime import datetime +from urllib.request import urlretrieve + +import httpx +import pandas as pd +from bs4 import BeautifulSoup +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +GENE_IDS_CHUNKSIZE = 50 # max allowed by Ensembl REST API + +ENSEMBL_REST_SERVER = "https://rest.ensembl.org/" +SPECIES_INFO_BASE_ENDPOINT = "info/genomes/taxonomy/{species}" +TAXONOMY_NAME_ENDPOINT = "taxonomy/name/{species}" +ENSEMBL_API_HEADERS = { + "Content-Type": "application/json", + "Accept": "application/json", +} +STOP_RETRY_AFTER_DELAY = 120 + +NCBI_TAXONOMY_API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy" +NCBI_API_HEADERS = {"accept": "application/json", "content-type": "application/json"} + +ENSEMBL_DIVISION_TO_FOLDER = { + "EnsemblPlants": "plants", + "EnsemblVertebrates": "vertebrates", + "EnsemblMetazoa": "metazoa", + "EnsemblFungi": "fungi", + "EnsemblBacteria": "bacteria", + "EnsemblProtists": "protists", +} + 
+ENSEMBL_GENOMES_BASE_URL = "https://ftp.ebi.ac.uk/ensemblgenomes/pub/current/{}/gff3/" +ENSEMBL_VERTEBRATES_BASE_URL = "https://ftp.ensembl.org/pub/current/gff3/" + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get GEO Datasets accessions") + parser.add_argument( + "--species", + type=str, + dest="species", + required=True, + help="Species name", + ) + return parser.parse_args() + + +################################################################## +################################################################## +# httpx +################################################################## +################################################################## + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def parse_page_data(url: str) -> BeautifulSoup: + page = httpx.get(url) + page.raise_for_status() + return BeautifulSoup(page.content, "html.parser") + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_request_to_ncbi_taxonomy(taxid: str | int): + logger.info(f"Sending POST request to {NCBI_TAXONOMY_API_URL}") + taxons = [str(taxid)] + data = {"taxons": taxons} + response = httpx.post(NCBI_TAXONOMY_API_URL, headers=NCBI_API_HEADERS, json=data) + response.raise_for_status() + return response.json() + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_get_request_to_ensembl(url: str) -> list[dict]: 
+ logger.info(f"Sending GET request to {url}") + response = httpx.get(url, headers=ENSEMBL_API_HEADERS) + if response.status_code == 200: + response.raise_for_status() + else: + raise RuntimeError( + f"Failed to retrieve data: encountered error {response.status_code}" + ) + return response.json() + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def download_file(url: str, output_path: str): + try: + urlretrieve(url, output_path) + except Exception as e: + logger.error(f"Failed to download file from {url}: {e}") + raise + + +################################################################## +################################################################## +# PARSING +################################################################## +################################################################## + + +def get_species_taxid(species: str) -> int: + try: + return get_species_taxid_from_ensembl(species) + except Exception as e: + logger.error( + f"Could not get species taxid for species {species} using the Ensembl REST API: {e}.\nTrying NCBI taxonomy." + ) + ncbi_formated_species_name = format_species_name_for_ncbi_taxonomy(species) + return get_species_taxid_from_ncbi(ncbi_formated_species_name) + + +def get_species_taxid_from_ensembl(species: str) -> int: + url = ENSEMBL_REST_SERVER + TAXONOMY_NAME_ENDPOINT.format(species=species) + data = send_get_request_to_ensembl(url) + if len(data) == 0: + raise ValueError(f"No species found for species {species}") + elif len(data) > 1: + logger.warning( + f"Multiple species found for species {species}. Keeping the first one." + ) + species_data = data[0] + if "id" not in species_data: + raise ValueError( + f"Could not find taxid for species {species}. 
Data collected: {species_data}" + ) + return species_data["id"] + + +def get_species_taxid_from_ncbi(species: str) -> int: + result = send_request_to_ncbi_taxonomy(species) + if len(result["taxonomy_nodes"]) > 1: + raise ValueError(f"Multiple taxids for species {species}") + metadata = result["taxonomy_nodes"][0] + if "taxonomy" not in metadata: + raise ValueError(f"Could not find taxonomy results for species {species}") + return int(metadata["taxonomy"]["tax_id"]) + + +def get_species_division(species_taxid: int) -> str: + url = ENSEMBL_REST_SERVER + SPECIES_INFO_BASE_ENDPOINT.format( + species=str(species_taxid) + ) + data = send_get_request_to_ensembl(url) + if len(data) == 0: + raise ValueError(f"No division found for species Taxon ID {species_taxid}") + elif len(data) > 1: + logger.warning( + f"Multiple divisions found for species Taxon ID {species_taxid}. Keeping the first one." + ) + return data[0]["division"] + + +def get_species_category(species: str) -> str: + species_taxid = get_species_taxid(species) + logger.info(f"Got species taxid: {species_taxid}") + division = get_species_division(species_taxid) + logger.info(f"Got division: {division}") + return ENSEMBL_DIVISION_TO_FOLDER[division] + + +def get_division_url(species: str) -> str: + category = get_species_category(species) + if category == "vertebrates": + return ENSEMBL_VERTEBRATES_BASE_URL + else: + return ENSEMBL_GENOMES_BASE_URL.format(category) + + +def format_species_name_for_ensembl(species: str) -> str: + return species.replace(" ", "_").lower() + + +def format_species_name_for_ncbi_taxonomy(species: str) -> str: + return species.replace("_", " ").lower() + + +def parse_last_modified_date(dt_string: str) -> datetime | None: + try: + return datetime.strptime(dt_string, "%Y-%m-%d %H:%M") + except ValueError: + return None + + +def get_candidate_species_folders( + species: str, url: str, first_level: bool = True +) -> list[dict]: + soup = parse_page_data(url) + species_url_records = [] + + # 
adding progress bar only at the first level + iterator = tqdm(soup.find_all("tr")) if first_level else soup.find_all("tr") + for item in iterator: + # all line sections + line_sections = list(item.find_all("td")) + # all folders of interest have a name cell (index 1) and a date cell (index 2) + if len(line_sections) < 3: + continue + + folder_name_section = line_sections[1] + date_section = line_sections[2] + last_modified_date = parse_last_modified_date(date_section.text.strip()) + + for folder in folder_name_section.find_all("a"): + folder_url = f"{url}{folder.text}" + if folder.text.startswith(species): + d = { + "date": last_modified_date, + "url": folder_url, + "name": folder.text.rstrip("/"), + } + species_url_records.append(d) + logger.debug(f"Found candidate folder: {folder.text}") + elif folder.text.endswith("_collection/"): + species_url_records += get_candidate_species_folders( + species, folder_url, first_level=False + ) + else: + continue + + return species_url_records + + +def get_main_folder_url(records: list[dict], species: str) -> str | None: + main_folder_url = None + for record in records: + if record["name"] == species: + main_folder_url = record["url"] + break + return main_folder_url + + +def get_last_modified_folder_url(records: list[dict]) -> str: + df = pd.DataFrame.from_dict(records) + df.sort_values(by="date", ascending=False, inplace=True) + return df.iloc[0]["url"] + + +def get_current_annotation_folder(records: list[dict], species: str) -> str: + main_folder_url = get_main_folder_url(records, species) + if main_folder_url is not None: + return main_folder_url + + logger.info( + "Could not find a folder having the species as name. Checking for gca folders." + ) + gca_records = [ + record for record in records if record["name"].startswith(f"{species}_gca") + ] + if gca_records: + return get_last_modified_folder_url(gca_records) + + logger.info( + "Could not find a gca folder either. Getting the last modified one." 
+ ) + return get_last_modified_folder_url(records) + + +def parse_size(size_str): + """ + Convert size strings like '902K', '4.1M', '5G' to bytes. + + Parameters: + ----------- + size_str : str + Size string with suffix (K, M, G, T, etc.) + + Returns: + -------- + int : size in bytes + """ + size_str = size_str.strip().upper() + + # Define multipliers + multipliers = {"K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4, "P": 1024**5} + + # Check if last character is a unit + if size_str[-1] in multipliers: + number = float(size_str[:-1]) + multiplier = multipliers[size_str[-1]] + return int(number * multiplier) + else: + # No suffix, assume it's already in bytes + return int(float(size_str)) + + +def get_annotation_file(url: str) -> str: + soup = parse_page_data(url) + file_records = [] + + for item in soup.find_all("tr"): + # all line sections + line_sections = list(item.find_all("td")) + if len(line_sections) < 4: + continue + + file = line_sections[1].text.strip() + if not file.endswith(".gff3.gz"): + continue + + d = { + "file": file, + "date": parse_last_modified_date(line_sections[2].text.strip()), + "size": parse_size(line_sections[3].text.strip()), + } + file_records.append(d) + + if not file_records: + raise ValueError("No annotation files found") + + df = pd.DataFrame.from_dict(file_records) + + # keeping the biggest annotation + max_size_df = df.loc[ + [df["size"].idxmax()] + ] # double brackets to keep it as a DataFrame + if len(max_size_df) == 1: + return max_size_df["file"].iloc[0] + + # if multiple files with the same size, return the most recent + most_recent_df = max_size_df.loc[ + [max_size_df["date"].idxmax()] + ] # double brackets to keep it as a DataFrame + if len(most_recent_df) == 1: + return max_size_df["file"].iloc[0] + + # if still multiple files, return the first one + # remove the one ending with 'chr.gff3.gz' if it exists + if max_size_df["file"].str.endswith("chr.gff3.gz").any(): + max_size_df = 
max_size_df[~max_size_df["file"].str.endswith("chr.gff3.gz")] + return max_size_df["file"].iloc[0] + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + + species = format_species_name_for_ensembl(args.species) + division_url = get_division_url(species) + logger.info(f"Searching for the right folder in {division_url}") + + species_url_records = get_candidate_species_folders(species, division_url) + if not species_url_records: + raise ValueError(f"No species folder found for {species}") + + annotation_folder_url = get_current_annotation_folder(species_url_records, species) + logger.info(f"Found current annotation folder: {annotation_folder_url}") + + annotation_file = get_annotation_file(annotation_folder_url) + + annotation_full_url = annotation_folder_url + annotation_file + logger.info(f"Found annotation URL: {annotation_full_url}.\nDownloading...") + + download_file(annotation_full_url, annotation_file) + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/bin/download_latest_ncbi_annotation.py b/bin/download_latest_ncbi_annotation.py new file mode 100755 index 00000000..384906dc --- /dev/null +++ b/bin/download_latest_ncbi_annotation.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import shutil +import sys +import zipfile +from pathlib import Path + +import httpx +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO +) +logger = logging.getLogger(__name__) + +# Modern NCBI API +NCBI_DATASET_API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/" + +NCBI_TAXONOMY_ENDPOINT = "taxonomy" +NCBI_GENOME_DATASET_REPORT_BASE_ENDPOINT = "genome/taxon/{taxid}/dataset_report" +NCBI_DOWNLOAD_ENDPOINT = "genome/download" + + +NCBI_GENOME_DATASET_REPORT_API_PARAMS = { + "filters.has_annotation": True, + "page_size": 1000, +} +NCBI_API_HEADERS = {"accept": "application/json", "content-type": "application/json"} + +DOWNLOADED_FILENAME = "ncbi_dataset.zip" +ACCESSION_FILE = "accession.txt" + + +##################################################### +##################################################### +# PARSER +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get best assembly for a specific taxon ID" + ) + parser.add_argument("--species", type=str, required=True, help="Species name") + return parser.parse_args() + + +##################################################### +##################################################### +# httpx +##################################################### +##################################################### + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_post_request_to_ncbi_dataset(endpoint: str, data: dict, params: dict = {}): + url = NCBI_DATASET_API_URL + endpoint + response = httpx.post(url, headers=NCBI_API_HEADERS, json=data, params=params) + response.raise_for_status() + return 
response.json() + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_get_request_to_ncbi_dataset(endpoint: str, params: dict = {}): + url = NCBI_DATASET_API_URL + endpoint + response = httpx.get(url, headers=NCBI_API_HEADERS, params=params) + response.raise_for_status() + return response.json() + + +##################################################### +##################################################### +# DATA HANDLING +##################################################### +##################################################### + + +def get_species_taxid(species: str) -> int: + data = {"taxons": [species]} + result = send_post_request_to_ncbi_dataset(NCBI_TAXONOMY_ENDPOINT, data) + + if len(result["taxonomy_nodes"]) > 1: + raise ValueError(f"Multiple taxids for species {species}") + metadata = result["taxonomy_nodes"][0] + + if "taxonomy" not in metadata: + logger.info(f"Could not find taxonomy results for species {species}") + if "errors" in metadata: + for error in metadata["errors"]: + logger.error(f"Error: {error['reason']}\n") + sys.exit(100) + return int(metadata["taxonomy"]["tax_id"]) + + +def get_assembly_reports(taxid: int): + result = send_get_request_to_ncbi_dataset( + endpoint=NCBI_GENOME_DATASET_REPORT_BASE_ENDPOINT.format(taxid=taxid), + params=NCBI_GENOME_DATASET_REPORT_API_PARAMS, + ) + return result.get("reports", []) + + +def get_assembly_with_best_stats(reports: list[dict]): + # longest assembly first, ties broken by the smallest number of chromosomes + # reports lacking "assembly_stats" fall back to the defaults instead of raising AttributeError + sorted_reports = sorted( + reports, + key=lambda x: ( + int(x.get("assembly_stats", {}).get("total_sequence_length", 0)), + -int(x.get("assembly_stats", {}).get("total_number_of_chromosomes", 1e9)), + ), + reverse=True, + ) + return sorted_reports[0] + + +def get_current_assemblies(reports: list[dict]) -> dict | None: + current_assembly_reports = [ + report + for report in reports + if report.get("assembly_info", {}).get("refseq_category") == "reference genome" 
+ ] + if not current_assembly_reports: + return None + + refseq_reports = [ + report + for report in current_assembly_reports + if report.get("source_database") == "SOURCE_DATABASE_REFSEQ" + ] + + if refseq_reports: + return refseq_reports[0] + else: + return None + + +def get_reference_assembly(reports: list[dict]) -> dict: + best_assembly_report = get_current_assemblies(reports) + if best_assembly_report is not None: + return best_assembly_report + else: + return get_assembly_with_best_stats(reports) + + +def format_species_name(species: str): + return species.replace("_", " ").lower() + + +def download_genome_annotation(genome_accession: str) -> str: + """Download the GFF annotation archive of an assembly and return the path of the zip file.""" + data = {"accessions": [genome_accession], "include_annotation_type": ["GENOME_GFF"]} + params = {"filename": DOWNLOADED_FILENAME} + # the request goes to the download endpoint (not the taxonomy one) and the payload is written to disk here, + # because the JSON request helper calls .json() on the response and never saves a file + # NOTE(review): assumes the endpoint answers with the zip archive as the response body - confirm against the NCBI API docs + url = NCBI_DATASET_API_URL + NCBI_DOWNLOAD_ENDPOINT + headers = {**NCBI_API_HEADERS, "accept": "application/zip"} + response = httpx.post(url, headers=headers, json=data, params=params, timeout=600) + response.raise_for_status() + Path(DOWNLOADED_FILENAME).write_bytes(response.content) + if not Path(DOWNLOADED_FILENAME).exists(): + raise FileNotFoundError( + f"Downloaded file not found for accession {genome_accession}" + ) + return DOWNLOADED_FILENAME + + +def extract_annotation_file_from_archive(): + with zipfile.ZipFile(DOWNLOADED_FILENAME, "r") as zip_ref: + zip_ref.extractall() + + valid_files = list(Path().cwd().glob(f"ncbi_dataset/data/{accession}/*.gff")) + + if not valid_files: + raise ValueError(f"No annotation file found for accession {accession}") + + if len(valid_files) > 1: + logger.warning( + f"Multiple annotation files found for accession {accession}. 
Taking the first one" + ) + + annotation_file = valid_files[0] + shutil.move(annotation_file, f"{accession}.gff") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + +if __name__ == "__main__": + args = parse_args() + + species = format_species_name(args.species) + + species_taxid = get_species_taxid(species) + logger.info(f"Species taxid: {species_taxid}") + + logger.info(f"Getting best NCBI assembly for taxid: {species_taxid}") + reports = get_assembly_reports(species_taxid) + + if not reports: + logger.error(f"No assembly reports found for taxid {species_taxid}") + sys.exit(100) + + # looping while we can get an annotation file + annotation_found = False + while not annotation_found and reports: + best_assembly_report = get_reference_assembly(reports) + logger.info( + f"Best assembly: {best_assembly_report['accession']}. Trying to download annotation" + ) + accession = best_assembly_report["accession"] + try: + download_genome_annotation(accession) + extract_annotation_file_from_archive() + annotation_found = True + except Exception as e: + logger.error(f"Error downloading annotation for accession {accession}: {e}") + + if not annotation_found: + # Remove the best assembly report from the list of reports + reports = [report for report in reports if report["accession"] != accession] + + if not annotation_found: + logger.error(f"No annotation found for taxid {species_taxid}") + sys.exit(100) + + logger.info("Done") diff --git a/bin/edger_normalize.R b/bin/edger_normalize.R deleted file mode 100755 index 857a633d..00000000 --- a/bin/edger_normalize.R +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env Rscript - -# Written by Olivier Coen. Released under the MIT license. 
- -library(edgeR) -library(optparse) - -##################################################### -##################################################### -# FUNCTIONS -##################################################### -##################################################### - -get_args <- function() { - - option_list <- list( - make_option("--counts", dest = 'count_file', help = "Path to input count file"), - make_option("--design", dest = 'design_file', help = "Path to input design file") - ) - - args <- parse_args(OptionParser( - option_list = option_list, - description = "Normalize counts using edgeR" - )) - - return(args) -} - -replace_zero_counts_with_pseudocounts <- function(count_data_matrix) { - # Add a small pseudocount of 0.01 to avoid zero counts - count_data_matrix[count_data_matrix == 0] <- 0.01 - return(count_data_matrix) -} - -filter_out_lowly_expressed_genes <- function(dge) { - # filter the dataframe to exclude rows where the mean is 0 - # filter out lowly expressed genes - keep <- filterByExpr(dge) - dge <- dge[keep, , keep.lib.sizes=FALSE] - return(dge) -} - -get_non_zero_rows <- function(count_matrix) { - # get gene IDs corresponding to rows with only non-zero counts - non_zero_rows <- rownames(count_matrix[apply(count_matrix!=0, 1, all),]) - return(non_zero_rows) -} - -filter_out_genes_with_at_least_one_zero <- function(non_zero_rows, cpm_counts) { - # filter out genes with zero counts - cpm_counts <- cpm_counts[rownames(cpm_counts) %in% non_zero_rows, ] - return(cpm_counts) -} - -get_log2_cpm_counts <- function(dge) { - cpm_counts <- cpm(dge, normalized.lib.sizes = TRUE, log = TRUE) - return(cpm_counts) -} - - -get_normalized_cpm_counts <- function(count_file, design_file) { - - print(paste('Normalizing counts in:', count_file)) - - count_data <- read.csv(args$count_file, row.names = 1) - design_data <- read.csv(design_file) - - design_data <- design_data[design_data$sample %in% colnames(count_data), ] - group <- factor(design_data$condition) - 
count_matrix <- as.matrix(count_data) - - non_zero_rows <- get_non_zero_rows(count_matrix) - - count_matrix <- replace_zero_counts_with_pseudocounts(count_matrix) - - dge <- DGEList(counts = count_matrix, group = group) - rownames(dge) <- rownames(count_matrix) - colnames(dge) <- colnames(count_matrix) - - dge <- filter_out_lowly_expressed_genes(dge) - - # normalization - dge <- calcNormFactors(dge, method="TMM") - - cpm_counts <- get_log2_cpm_counts(dge) - - cpm_counts <- filter_out_genes_with_at_least_one_zero(non_zero_rows, cpm_counts) - - return(cpm_counts) -} - -export_data <- function(cpm_counts, filename) { - filename <- sub("\\.csv$", ".log_cpm.csv", filename) - print(paste('Exporting normalized counts per million to:', filename)) - write.table(cpm_counts, filename, sep = ',', row.names = TRUE, quote = FALSE) -} - -##################################################### -##################################################### -# MAIN -##################################################### -##################################################### - -args <- get_args() - -cpm_counts <- get_normalized_cpm_counts(args$count_file, args$design_file) - -export_data(cpm_counts, basename(args$count_file)) diff --git a/bin/extract_gene_ids.py b/bin/extract_gene_ids.py new file mode 100755 index 00000000..f09f01c8 --- /dev/null +++ b/bin/extract_gene_ids.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +CLEANED_GENE_IDS_SUFFIX = ".gene_ids.txt" + + +def parse_args(): + parser = argparse.ArgumentParser("Rename gene IDs using mapped IDs") + parser.add_argument( + "--count-file", type=Path, required=True, help="Input file containing counts" + ) + return parser.parse_args() + + +def get_sorted_gene_ids(df: pl.DataFrame): + return ( + df.select(config.GENE_ID_COLNAME) + .sort(config.GENE_ID_COLNAME) + .to_series() + .to_list() + ) + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + logger.info(f"Converting IDs for count file {args.count_file.name}...") + + df = parse_count_table(args.count_file) + + logger.info("Writing cleaned IDs") + gene_ids_outfile = args.count_file.with_name( + args.count_file.stem + CLEANED_GENE_IDS_SUFFIX + ) + gene_ids = get_sorted_gene_ids(df) + + with open(gene_ids_outfile, "w") as fout: + fout.write("\n".join(gene_ids)) + + +if __name__ == "__main__": + main() diff --git a/bin/filter_and_rename_genes.py b/bin/filter_and_rename_genes.py new file mode 100755 index 00000000..106f8013 --- /dev/null +++ b/bin/filter_and_rename_genes.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table, parse_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +RENAMED_FILE_SUFFIX = ".renamed.parquet" + +WARNING_REASON_FILE = "warning_reason.txt" +FAILURE_REASON_FILE = "failure_reason.txt" + +UNMAPPED_FILE_SUFFIX = "unmapped.txt" +NOT_VALID_FILE_SUFFIX = "not_valid.txt" +MERGED_FILE_SUFFIX = "merged.txt" +FINAL_FILE_SUFFIX = "final.txt" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Rename gene IDs using mapped IDs") + parser.add_argument( + "--count-file", type=Path, required=True, help="Input file containing counts" + ) + parser.add_argument( + "--mappings", + type=Path, + dest="mapping_file", + help="Mapping file containing gene IDs", + ) + parser.add_argument( + "--valid-gene-ids", + type=Path, + dest="valid_gene_ids_file", + help="File containing valid gene IDs", + ) + return parser.parse_args() + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + logger.info(f"Converting IDs for count file {args.count_file.name}...") + + ############################################################# + # PARSING FILES + ############################################################# + + df = parse_count_table(args.count_file) + + if df.is_empty(): + msg = "COUNT FILE IS EMPTY" + logger.warning(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + ############################################################# + # GETTING 
MAPPINGS + ############################################################# + + mapping_df = parse_table(args.mapping_file) + mapping_dict = dict( + zip( + mapping_df[config.ORIGINAL_GENE_ID_COLNAME], + mapping_df[config.GENE_ID_COLNAME], + ) + ) + + ############################################################# + # MAPPING GENE IDS IN DATAFRAME + ############################################################# + + # IMPORTANT: KEEPING ONLY GENES THAT HAVE BEEN CONVERTED + # filtering the DataFrame to keep only the rows where the index can be mapped + original_nb_genes = len(df) + + rejected_df = df.filter(~pl.col(config.GENE_ID_COLNAME).is_in(mapping_dict.keys())) + nb_unmapped_genes = len(rejected_df) + + # df = df.loc[df.index.isin(mapping_dict)] + df = df.filter(pl.col(config.GENE_ID_COLNAME).is_in(mapping_dict.keys())) + nb_mapped_genes = len(df) + + with open(UNMAPPED_FILE_SUFFIX, "w") as f: + f.write(str(nb_unmapped_genes)) + + if df.is_empty(): + sample_size = min(5, nb_unmapped_genes) + example_rejected_genes = ( + rejected_df[config.GENE_ID_COLNAME].head(sample_size).to_list() + ) + msg = f"NO GENES WERE MAPPED. EXAMPLE OF GENE IDS: {example_rejected_genes}" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + + with open(NOT_VALID_FILE_SUFFIX, "w") as f: + f.write("0") + with open(MERGED_FILE_SUFFIX, "w") as f: + f.write("0") + with open(FINAL_FILE_SUFFIX, "w") as f: + f.write("0") + + sys.exit(0) + + if len(df) < original_nb_genes: + sample_size = min(5, nb_unmapped_genes) + example_rejected_genes = ( + rejected_df[config.GENE_ID_COLNAME].head(sample_size).to_list() + ) + msg = ( + f"{nb_mapped_genes / original_nb_genes:.2%} of genes were mapped ({nb_mapped_genes} out of {original_nb_genes}). 
" + + f"Example of unmapped genes: {example_rejected_genes}" + ) + logger.warning(msg) + with open(WARNING_REASON_FILE, "a") as f: + f.write(msg) + else: + logger.info( + f"All genes were mapped ({nb_mapped_genes} out of {original_nb_genes})" + ) + + logger.info("Renaming gene names") + # renaming gene names to mapped ids using mapping dict + df = df.with_columns( + pl.col(config.GENE_ID_COLNAME) + .replace(mapping_dict) + .alias(config.GENE_ID_COLNAME) + ) + + ############################################################# + # GETTING VALID GENE IDS + ############################################################# + + logger.info("Keeping only genes with sufficient occurrence over datasets") + nb_genes_before_validation = len(df) + + with open(args.valid_gene_ids_file, "r") as fin: + valid_gene_ids = [line.strip() for line in fin.readlines()] + + df = df.filter(pl.col(config.GENE_ID_COLNAME).is_in(valid_gene_ids)) + + nb_not_valid_genes = nb_genes_before_validation - len(df) + logger.info( + f"{nb_not_valid_genes} ({nb_not_valid_genes / nb_genes_before_validation:.2%}) genes were not valid" + ) + + with open(NOT_VALID_FILE_SUFFIX, "w") as f: + f.write(str(nb_not_valid_genes)) + + if df.is_empty(): + msg = "NO GENES LEFT AFTER REMOVING RARE GENE IDS" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + + with open(MERGED_FILE_SUFFIX, "w") as f: + f.write("0") + with open(FINAL_FILE_SUFFIX, "w") as f: + f.write("0") + + sys.exit(0) + + ############################################################# + # GENE COUNT HANDLING + ############################################################# + + # handling cases where multiple genes have the same Gene ID + # since subsequent steps in the pipeline require integer values, + # we need to ensure that the resulting DataFrame has integer values + + # TODO: check is there is another way to avoid duplicate gene names + # sometimes different gene names have the same Gene ID + # for now, we just get the max of 
values, but this is not ideal + # we do not take the mean because if counts are integers, we want to keep them as integers + + logger.info("Computing max counts for genes with duplicate IDs") + df = df.group_by(config.GENE_ID_COLNAME, maintain_order=True).agg( + pl.exclude(config.GENE_ID_COLNAME).max() + ) + + ############################################################# + # WRITING OUTFILES + ############################################################# + + nb_merged = nb_mapped_genes - len(df) + with open(MERGED_FILE_SUFFIX, "w") as f: + f.write(str(nb_merged)) + with open(FINAL_FILE_SUFFIX, "w") as f: + f.write(str(len(df))) + + logger.info("Writing output file") + outfilename = args.count_file.with_suffix(RENAMED_FILE_SUFFIX).name + df.write_parquet(outfilename) + + +if __name__ == "__main__": + main() diff --git a/bin/filter_out_samples_with_too_many_missing_values.py b/bin/filter_out_samples_with_too_many_missing_values.py new file mode 100755 index 00000000..973111b4 --- /dev/null +++ b/bin/filter_out_samples_with_too_many_missing_values.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import export_parquet, parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE_SUFFIX = ".nulls_filtered.parquet" +RATIO_NULL_VALUES_PER_SAMPLE_OUTFILE = "ratio_null_values_per_sample.csv" +RATIO_NULL_VALUES_OUTFILE = "ratio_null_values.csv" +NB_REJECTED_SAMPLES_OUTFILE = "nb_rejected_samples.csv" +NB_KEPT_SAMPLES_OUTFILE = "nb_kept_samples.csv" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Filter out samples not valid") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--max-null-ratio", + type=float, + dest="max_null_ratio", + required=True, + help="Maximum ratio of null values", + ) + parser.add_argument( + "--valid-gene-ids", + type=Path, + dest="valid_gene_ids", + required=True, + help="Valid gene IDs", + ) + return parser.parse_args() + + +def get_nb_valid_genes(valid_gene_ids_file: Path) -> int: + with open(valid_gene_ids_file, "r") as fin: + return len(fin.readlines()) + + +def get_nb_internal_nulls(df: pl.DataFrame) -> pl.DataFrame: + """ + Get the number of null values per sample. 
+ :return: + A polars dataframe containing 2 columns: + - sample: name of the sample + - nb_nulls: number of null values + """ + return df.select(pl.exclude(config.GENE_ID_COLNAME).is_null().sum()).transpose( + include_header=True, + header_name=config.SAMPLE_COLNAME, + column_names=[config.GENE_COUNT_COLNAME], + ) + + +def get_ratio_null_values( + df: pl.DataFrame, nb_missing_genes: int, nb_valid_genes: int +) -> pl.DataFrame: + return df.select( + pl.col(config.SAMPLE_COLNAME), + ( + (pl.col(config.GENE_COUNT_COLNAME) + pl.lit(nb_missing_genes)) + / nb_valid_genes + ).alias(config.RATIO_COLNAME), + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + # putting all counts into a single dataframe + logger.info("Loading count data...") + count_df = parse_count_table(args.count_file) + nb_genes = len(count_df) + nb_samples = count_df.shape[1] - 1 + logger.info(f"Loaded count data with {nb_genes} genes and {nb_samples} samples") + + logger.info("Computing total number of nulls per sample") + + # getting nb of missing values inside the dataframe (rare but may exist) + nb_null_values_df = get_nb_internal_nulls(count_df) + + # getting nb of missing valid genes inside the dataframe + nb_valid_genes = get_nb_valid_genes(args.valid_gene_ids) + nb_missing_genes = nb_valid_genes - nb_genes + + # adding the nb of missing genes to the number of null vaues for each sample + ratio_values_df = get_ratio_null_values( + nb_null_values_df, nb_missing_genes, nb_valid_genes + ) + + valid_samples = ( + ratio_values_df.filter(pl.col(config.RATIO_COLNAME) <= args.max_null_ratio) + .select(pl.col(config.SAMPLE_COLNAME)) + .to_series() + .to_list() + ) + + # if at least one valid sample is remaining, making an updated count dataframe + if valid_samples: + 
logger.info(f"Filtered out {count_df.shape[1] - len(valid_samples)} columns") + valid_count_df = count_df.select([config.GENE_ID_COLNAME] + valid_samples) + export_parquet(valid_count_df, args.count_file, OUTFILE_SUFFIX) + else: + logger.error("No valid columns remaining") + + # collect all ratio values for export + ratio_values = ratio_values_df.select(config.RATIO_COLNAME).to_series().to_list() + with open(RATIO_NULL_VALUES_OUTFILE, "w") as outfile: + # sorting values in order to having consistent output + outfile.write(",".join([str(val) for val in sorted(ratio_values)])) + + ratio_values_df.write_csv(RATIO_NULL_VALUES_PER_SAMPLE_OUTFILE) + + with open(NB_KEPT_SAMPLES_OUTFILE, "w") as fout: + fout.write(str(len(valid_samples))) + + with open(NB_REJECTED_SAMPLES_OUTFILE, "w") as fout: + fout.write(str(nb_samples - len(valid_samples))) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/bin/filter_out_samples_with_too_many_zeros.py b/bin/filter_out_samples_with_too_many_zeros.py new file mode 100755 index 00000000..7708b0ad --- /dev/null +++ b/bin/filter_out_samples_with_too_many_zeros.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import export_parquet, parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE_SUFFIX = ".zeros_filtered.parquet" +RATIO_ZEROS_PER_SAMPLE_OUTFILE = "ratio_zeros_per_sample.csv" +RATIO_ZERO_VALUES_OUTFILE = "ratio_zeros.csv" +NB_REJECTED_SAMPLES_OUTFILE = "nb_rejected_samples.csv" +NB_KEPT_SAMPLES_OUTFILE = "nb_kept_samples.csv" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Filter out samples not valid") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--max-zero-ratio", + type=float, + dest="max_zero_ratio", + required=True, + help="Maximum ratio of zeros allowed", + ) + return parser.parse_args() + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + # putting all counts into a single dataframe + logger.info("Loading count data...") + count_df = parse_count_table(args.count_file) + nb_samples = count_df.shape[1] - 1 + logger.info( + f"Loaded count data with {len(count_df)} genes and {nb_samples} samples" + ) + + # computing the number of zeros values per sample + ratio_zeros_df = count_df.select( + pl.exclude(config.GENE_ID_COLNAME).eq(pl.lit(0)).mean() + ) + + # getting the samples with a zero ratio lower than the max zero ratio + valid_samples = [ + col + for col in ratio_zeros_df.columns + if ratio_zeros_df[col][0] <= args.max_zero_ratio + ] + + 
# if at least one valid sample is remaining, making an updated count dataframe + if valid_samples: + logger.info(f"Filtered out {count_df.shape[1] - len(valid_samples)} columns") + valid_count_df = count_df.select( + pl.col(config.GENE_ID_COLNAME), pl.col(valid_samples) + ) + export_parquet(valid_count_df, args.count_file, OUTFILE_SUFFIX) + else: + logger.error("No valid columns remaining") + + # collect all ratio values for export + ratio_values = list(ratio_zeros_df.row(0)) + with open(RATIO_ZERO_VALUES_OUTFILE, "w") as outfile: + # sorting values in order to having consistent output + outfile.write(",".join([str(val) for val in sorted(ratio_values)])) + + ratio_zeros_df.write_csv(RATIO_ZEROS_PER_SAMPLE_OUTFILE) + + with open(NB_KEPT_SAMPLES_OUTFILE, "w") as fout: + fout.write(str(len(valid_samples))) + + with open(NB_REJECTED_SAMPLES_OUTFILE, "w") as fout: + fout.write(str(nb_samples - len(valid_samples))) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/bin/get_candidate_genes.py b/bin/get_candidate_genes.py new file mode 100755 index 00000000..c0f8c58b --- /dev/null +++ b/bin/get_candidate_genes.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# outfile names +CANDIDATE_COUNTS_OUTFILENAME = "section_{}.candidate_counts.parquet" +STATS_WITH_SECTION_OUTFILENAME = "section_{}.stats.parquet" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get statistics from count data for each gene" + ) + parser.add_argument( + "--counts", + type=Path, + dest="count_file", + required=True, + help="File containing counts for all genes", + ) + parser.add_argument( + "--stats", + type=Path, + dest="stat_file", + required=True, + help="File containing statistics of expression over all datasets", + ) + parser.add_argument( + "--nb-candidates-per-section", + type=int, + dest="nb_candidates_per_section", + required=True, + help="Number of candidates per section to select for subsequent steps", + ) + parser.add_argument( + "--nb-sections", + type=int, + dest="nb_sections", + required=True, + help="Number of sections to divide the data into", + ) + return parser.parse_args() + + +def parse_stats(file: Path) -> pl.DataFrame: + return pl.read_csv(file).select( + pl.col(config.GENE_ID_COLNAME).cast(pl.String()), + pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()), + ) + + +def add_sections(stat_df: pl.DataFrame, nb_sections: int): + """ + Assigns gene to sections bases on mean expression level + Polars only ranks non-null values and preserves the null ones. 
+ """ + return stat_df.with_columns( + ( + pl.col(config.MEAN_COLNAME).rank(method="ordinal", descending=True) + / pl.col(config.MEAN_COLNAME).count() + * nb_sections + + pl.lit(1) + ) + .floor() + .cast(pl.Int8) + # we want the only value at to be at + .replace({nb_sections + 1: nb_sections}) + .alias("section") + ) + + +def get_best_candidates( + stat_df: pl.DataFrame, nb_candidates_per_section: int +) -> pl.DataFrame: + return ( + stat_df.sort( + config.COEFFICIENT_OF_VARIATION_COLNAME, + descending=False, + nulls_last=True, + maintain_order=True, + ) + .group_by("section", maintain_order=True) + .agg(pl.col(config.GENE_ID_COLNAME).head(nb_candidates_per_section)) + ) + + +def get_counts_for_candidates(file: Path, best_candidates: list[str]) -> pl.DataFrame: + return pl.read_parquet(file).filter( + pl.col(config.GENE_ID_COLNAME).is_in(best_candidates) + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + stat_df = parse_stats(args.stat_file) + + # first basic filters + # stat_df = filter_out_low_expression_genes(stat_df, args.min_pct_quantile_expr_level) + # stat_lf = filter_out_genes_with_zero_counts(stat_lf) + + logger.info("Getting sections") + stat_df = add_sections(stat_df, args.nb_sections) + + logger.info("Getting best candidates") + # get base candidate genes based on the chosen statistical descriptor (cv, rcvm) + best_candidates_df = get_best_candidates( + stat_df, + args.nb_candidates_per_section, + ) + + logger.info("Getting counts of best candidates") + # this was coded as a loop in order to keep it simple + # since it does not impact much speed and scability + for row in best_candidates_df.iter_rows(): + section = row[0] + best_candidates = row[1] + candidate_gene_count_lf = get_counts_for_candidates( + args.count_file, 
best_candidates + ) + # exporting count data for the best candidates for this section + candidate_gene_count_lf.write_parquet( + CANDIDATE_COUNTS_OUTFILENAME.format(section) + ) + # exporting statistics for all genes in this section + stat_df.filter(pl.col("section") == section).write_parquet( + STATS_WITH_SECTION_OUTFILENAME.format(section) + ) + + +if __name__ == "__main__": + main() diff --git a/bin/get_eatlas_accessions.py b/bin/get_eatlas_accessions.py index 4369a531..58f72443 100755 --- a/bin/get_eatlas_accessions.py +++ b/bin/get_eatlas_accessions.py @@ -3,40 +3,38 @@ # Written by Olivier Coen. Released under the MIT license. import argparse -import requests -from retry import retry -from os import cpu_count -import json +import logging +import random +from functools import partial from multiprocessing import Pool -import nltk -from nltk.corpus import wordnet + +import httpx +import pandas as pd +import yaml +from natural_language_utils import keywords_in_fields +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +ALLOWED_PLATFORMS = ["rnaseq", "microarray"] +# accessions that should not be fetched automatically: +# - E-GTEX-8 contains 17350 samples (way too big) +EXCLUDED_ACCESSION_PATTERNS = ["E-GTEX-"] ALL_EXP_URL = "https://www.ebi.ac.uk/gxa/json/experiments/" ACCESSION_OUTFILE_NAME = "accessions.txt" -JSON_OUTFILE_NAME = "found.json" - -################################################################## -################################################################## -# NLTK MODELS AND OBJECTS -################################################################## -################################################################## +# ALL_EXPERIMENTS_METADATA_OUTFILE_NAME = "all_experiments.metadata.tsv" +SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME = "species_experiments.metadata.tsv" +SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME = 
"selected_experiments.metadata.tsv" +FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME = "filtered_experiments.keywords.yaml" -nltk.download("punkt_tab") -nltk.download("averaged_perceptron_tagger_eng") -nltk.download("wordnet") - -lemmatizer = nltk.WordNetLemmatizer() -stemmer = nltk.PorterStemmer() - -################################################################## -################################################################## -# EXCEPTIONS -################################################################## -################################################################## - - -class ExpressionAtlasNothingFoundError(Exception): - pass +SAMPLING_QUOTA_OUTFILE = "sampling_quota.txt" ################################################################## @@ -48,139 +46,45 @@ class ExpressionAtlasNothingFoundError(Exception): def parse_args(): parser = argparse.ArgumentParser("Get expression atlas accessions") - parser.add_argument("--species", type=str, help="Species to convert IDs for") + parser.add_argument( + "--species", + type=str, + required=True, + help="Search Expression Atlas for this specific species", + ) parser.add_argument( "--keywords", type=str, nargs="*", help="Keywords to search for in experiment description", ) + parser.add_argument( + "--platform", type=str, help="Platform type", choices=ALLOWED_PLATFORMS + ) + parser.add_argument( + "--random-sampling-size", + dest="random_sampling_size", + type=int, + help="Random sampling size", + ) + parser.add_argument( + "--random-sampling-seed", + dest="random_sampling_seed", + type=int, + help="Random sampling seed", + ) + parser.add_argument( + "--cpus", type=int, dest="nb_cpus", required=True, help="Number of CPUs" + ) return parser.parse_args() -def get_wordnet_pos(token: str): - tag = nltk.pos_tag([token])[0][1][0].upper() - tag_dict = { - "J": wordnet.ADJ, - "N": wordnet.NOUN, - "V": wordnet.VERB, - "R": wordnet.ADV, - } - return tag_dict.get(tag, wordnet.NOUN) # Default to NOUN if not found - - -def 
get_stemmed_tokens(sentence: str): - """ - Tokenize a sentence into its constituent words, and then stem each word - - Parameters - ---------- - sentence : str - The sentence to be tokenized and stemmed - - Returns - ------- - tokens : List[str] - The list of stemmed tokens - """ - - tokens = nltk.word_tokenize(sentence) - return [stemmer.stem(token) for token in tokens] - - -def get_lemmed_tokens(sentence: str): - """ - Tokenize a sentence into its constituent words, and then lemmatize each word - - Parameters - ---------- - sentence : str - The sentence to be tokenized and lemmatized - - Returns - ------- - tokens : List[str] - The list of lemmatized tokens - """ - tokens = nltk.word_tokenize(sentence) - return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens] - - -def get_synonyms(word): - """ - Get all synonyms of a word from the wordnet database. - - Parameters - ---------- - word : str - The word for which to get synonyms - - Returns - ------- - synonyms : set - A set of all synonyms of the word - """ - synonyms = [] - for syn in wordnet.synsets(word): - for lemma in syn.lemmas(): - synonyms.append(lemma.name()) # Get the name of each lemma (synonym) - return set(synonyms) # Return as a set to avoid duplicates - - -def get_all_candidate_target_words(sentence: str): - """ - Get all candidate target words from a sentence by stemming and lemmatizing the - tokens and getting synonyms from the wordnet database. 
- - Parameters - ---------- - sentence : str - The sentence from which to get candidate target words - - Returns - ------- - candidates : list - A list of all candidate target words - """ - candidates = [] - lemmatized_tokens = get_stemmed_tokens(sentence) - stemmed_tokens = get_stemmed_tokens(sentence) - tokens = list(set(lemmatized_tokens + stemmed_tokens)) - for token in tokens: - candidates += get_synonyms(token) - return candidates - - -def word_in_sentence(word: str, sentence: str): - """ - Checks if a word (or a stemmed version of it) is in a sentence, or if it is a - subword of a stemmed version of any word in the sentence. - - Parameters - ---------- - word : str - The word to be searched for - sentence : str - The sentence in which to search for the word - - Returns - ------- - bool - True if the word is found in the sentence, False otherwise - """ - for stemmed_word in [word] + get_stemmed_tokens(word): - # testing if stemmed word is in sentence as it is - if stemmed_word in sentence: - return True - # or testing if stemmed word is a subword of a stemmed word from the sentence - for target_word in get_all_candidate_target_words(sentence): - if stemmed_word in target_word: - return True - return False - - -@retry(ExpressionAtlasNothingFoundError, tries=3, delay=2, backoff=2) -def get_data(url: str): +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def get_data(url: str) -> dict: """ Queries a URL and returns the data as a JSON object @@ -199,13 +103,9 @@ def get_data(url: str): RuntimeError If the query fails """ - response = requests.get(url) - if response.status_code == 200: - return response.json() - elif response.status_code == 500: - raise ExpressionAtlasNothingFoundError - else: - raise RuntimeError(f"Failed to retrieve data: {response.status_code}") + response = httpx.get(url) + response.raise_for_status() + return response.json() def 
get_experiment_description(exp_dict: dict): @@ -238,6 +138,36 @@ def get_experiment_description(exp_dict: dict): raise KeyError(f"Could not find description field in {exp_dict}") +def get_experiment_accession(exp_dict: dict): + """ + Gets the accession from an experiment dictionary + + Parameters + ---------- + exp_dict : dict + The experiment dictionary + + Returns + ------- + accession : str + The experiment accession + + Raises + ------ + KeyError + If the accession field is not found in the experiment dictionary + """ + if "experiment" in exp_dict: + if "accession" in exp_dict["experiment"]: + return exp_dict["experiment"]["accession"] + else: + raise KeyError(f"Could not find accession field in {exp_dict}") + elif "experimentAccession" in exp_dict: + return exp_dict["experimentAccession"] + else: + raise KeyError(f"Could not find accession field in {exp_dict}") + + def get_properties_values(exp_dict: dict): """ Gets all values from properties from an experiment dictionary @@ -263,17 +193,78 @@ def get_properties_values(exp_dict: dict): break if not key_found: raise KeyError(f"Could not find property value in {column_header_dict}") - return values + # removing empty strings + values = [value for value in values if value != ""] + # removing duplicates + return list(set(values)) + + +def get_eatlas_experiments(): + """ + Gets all experiments from Expression Atlas + + Parameters + ---------- + + Returns + ------- + experiments : list + A list of experiment dictionaries + """ + data = get_data(ALL_EXP_URL) + return data["experiments"] -def get_species_experiments( - species: str, -): +def filter_by_platform(experiments: list[dict], platform: str | None): """ - Gets all experiments for a given species + Gets all experiments for a given platform from Expression Atlas + Possible platforms in Expression Atlas are 'rnaseq', 'microarray', 'proteomics' Parameters ---------- + experiments: list[str] + platform : str + Name of platform. 
Example: "rnaseq" + + Returns + ------- + experiments : list + A list of experiment dictionaries + """ + platform_experiments = [] + for exp_dict in experiments: + if technology_type := exp_dict.get("technologyType"): + parsed_technology_type = ( + technology_type[0] + if isinstance(technology_type, list) + else technology_type + ) + # parsed_platform is in ["rnaseq", "microarray", "proteomics", ...] + parsed_platform = ( + parsed_technology_type.lower().split(" ")[0].replace("-", "") + ) + + if platform is not None: + if parsed_platform == platform: + platform_experiments.append(exp_dict) + else: + if parsed_platform in ALLOWED_PLATFORMS: + platform_experiments.append(exp_dict) + + else: + logger.warning( + f"Technology type not found for experiment {exp_dict['accession']}" + ) + return platform_experiments + + +def get_species_experiments(experiments: list[dict], species: str): + """ + Gets all experiments for a given species from Expression Atlas + + Parameters + ---------- + experiments: list[str] species : str Name of species. 
Example: "Arabidopsis thaliana" @@ -282,12 +273,11 @@ def get_species_experiments( experiments : list A list of experiment dictionaries """ - data = get_data(ALL_EXP_URL) - experiments = [] - for exp_dict in data["experiments"]: + species_experiments = [] + for exp_dict in experiments: if exp_dict["species"] == species: - experiments.append(exp_dict) - return experiments + species_experiments.append(exp_dict) + return species_experiments def get_experiment_data(exp_dict: dict): @@ -308,51 +298,100 @@ def get_experiment_data(exp_dict: dict): return get_data(exp_url) -def search_keywords_in_experiment(exp_dict: dict, keywords: list[str]): - """ - Searches for keywords in an experiment's description and conditions +def filter_out_excluded_accessions(experiments: list[dict]) -> list[dict]: + valid_experiments = [] + for exp_dict in experiments: + for accession_pattern in EXCLUDED_ACCESSION_PATTERNS: + if exp_dict["experimentAccession"].startswith(accession_pattern): + logger.warning( + f"Skipping experiment {exp_dict['experimentAccession']} due to exclusion pattern" + ) + break + else: + valid_experiments.append(exp_dict) + return valid_experiments - Parameters - ---------- - exp_dict : dict - The experiment dictionary - keywords : list[str] - The list of keywords to search for - Returns - ------- - result : dict - A dictionary with the experiment data and a description of which keyword was found - Example: {'data': {'experiment': {...}}, 'found': {'word': 'salt', 'description': '...'}} - If no keyword was found, returns None - """ +def parse_experiment(exp_dict: dict): + # getting accession and description + accession = get_experiment_accession(exp_dict) + description = get_experiment_description(exp_dict) + # getting properties of this experiment exp_data = get_experiment_data(exp_dict) - exp_description = get_experiment_description(exp_dict) + properties_values = get_properties_values(exp_data) - for keyword in keywords: - if word_in_sentence(keyword, 
exp_description): - return { - "data": exp_data, - "found": {"word": keyword, "description": exp_description}, - } + return { + "accession": accession, + "description": description, + "properties": properties_values, + } - # if no keyword was found in the description - # we try and find a keyword in one of the conditions of the experimental design - exp_data = get_experiment_data(exp_dict) - properties_values = get_properties_values(exp_data) - properties_values_str = " ".join(properties_values) - for keyword in keywords: - if word_in_sentence(keyword, properties_values_str): - return { - "data": exp_data, - "found": {"word": keyword, "properties": properties_values_str}, - } +def filter_experiment_with_keywords(exp_dict: dict, keywords: list[str]) -> dict | None: + all_searchable_fields = [exp_dict["description"]] + exp_dict["properties"] + found_keywords = keywords_in_fields(all_searchable_fields, keywords) + # only returning experiments if found keywords + if found_keywords: + exp_dict["found_keywords"] = list(set(found_keywords)) + return exp_dict + else: + return None + + +def get_metadata_for_selected_experiments( + experiments: list[dict], results: list[dict] +) -> list[dict]: + filtered_accessions = [result_dict["accession"] for result_dict in results] + return [ + exp_dict + for exp_dict in experiments + if get_experiment_accession(exp_dict) in filtered_accessions + ] + + +def sample_experiments_randomly( + experiments: list[dict], sampling_size: int, seed: int +) -> tuple[list[str], bool]: + random.seed(seed) + sampled_experiments = [] + + total_nb_samples = 0 + sampling_quota_reached = False + experiments_left = list(experiments) + while experiments_left: + # if the min number of samples is greater than the remaining space left, we get out of the loop + experiments_left_nb_samples = [exp["nb_samples"] for exp in experiments_left] + min_nb_samples = min(experiments_left_nb_samples) + if min_nb_samples > sampling_size - total_nb_samples: + 
sampling_quota_reached = True + logger.warning("Sampling quota reached") + break + + experiment = None + test_total_nb_samples = int(total_nb_samples) + experiments_not_tested = list(experiments_left) + while experiments_not_tested: + experiment = random.choice(experiments_not_tested) + experiments_not_tested.remove(experiment) + # if we do not exceed the sampling size with this experiment + # we keep it + test_total_nb_samples = total_nb_samples + experiment["nb_samples"] + if test_total_nb_samples <= sampling_size: + break + + # this should not happen but we keep it for safety + if experiment is None: + logger.error("No experiment found") + continue - return None + total_nb_samples = test_total_nb_samples + experiments_left.remove(experiment) + sampled_experiments.append(experiment) + return [exp["accession"] for exp in sampled_experiments], sampling_quota_reached -def format_species_name(species: str): + +def format_species_name(species: str) -> str: return species.replace("_", " ").capitalize().strip() @@ -366,57 +405,125 @@ def format_species_name(species: str): def main(): args = parse_args() + results = None + selected_accessions = [] + selected_experiments = [] + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING EXPRESSION ATLAS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Getting arguments species_name = format_species_name(args.species) keywords = args.keywords - print(f"Getting experiments corresponding to species {species_name}") - species_experiments = get_species_experiments(species_name) - print(f"Found {len(species_experiments)} experiments") + logger.info(f"Getting experiments corresponding to species {species_name}") + experiments = get_eatlas_experiments() - if keywords: - print(f"Filtering experiments corresponding to keywords {keywords}") - selected_accessions = [] - found_dict = {} - with Pool(cpu_count()) as pool: - items = [ - ( - exp_dict, - keywords, - ) - for exp_dict in species_experiments - ] - results = 
pool.starmap(search_keywords_in_experiment, items) - for result in results: - if result is not None: - accession = result["data"]["experiment"]["accession"] - selected_accessions.append(accession) - found_dict[accession] = result["found"] - - if not selected_accessions: - raise RuntimeError( - "Could not find experiments for species {args.species} and keywords {args.keywords}" - ) - else: - print( - f"Kept {len(selected_accessions)} experiments:\n{selected_accessions}" - ) + logger.info("Filtering on species name") + experiments = get_species_experiments(experiments, species_name) + logger.info(f"Found {len(experiments)} experiments for species {species_name}") - print(f"Writing logs of found keywords to {JSON_OUTFILE_NAME}") - with open(JSON_OUTFILE_NAME, "w") as fout: - json.dump(found_dict, fout) + logger.info("Filtering experiments based on platform") + experiments = filter_by_platform(experiments, args.platform) - else: - print("No keywords specified. Keeping all experiments") - selected_accessions = [ - exp_dict["experimentAccession"] for exp_dict in species_experiments + logger.info("Filtering out excluded accessions") + experiments = filter_out_excluded_accessions(experiments) + + logger.info("Parsing experiments") + with Pool(processes=args.nb_cpus) as pool: + results = pool.map(parse_experiment, experiments) + + if keywords: + logger.info(f"Filtering experiments with keywords {keywords}") + func = partial(filter_experiment_with_keywords, keywords=keywords) + with Pool(processes=args.nb_cpus) as pool: + results = [res for res in pool.map(func, results) if res is not None] + logger.info( + f"Found {len(results)} experiments corresponding to keywords {keywords}" + ) + + # getting accessions of selected experiments + selected_accessions = [exp_dict["accession"] for exp_dict in results] + + sampling_status = "ok" + if args.random_sampling_size and args.random_sampling_seed: + selected_accession_to_nb_samples = [ + { + "accession": 
exp_dict["experimentAccession"], + "nb_samples": exp_dict["numberOfAssays"], + } + for exp_dict in experiments + if exp_dict["experimentAccession"] in selected_accessions ] - print(selected_accessions) - print(f"Writing accessions to {ACCESSION_OUTFILE_NAME}") + nb_samples_df = pd.DataFrame.from_dict(selected_accession_to_nb_samples) + nb_samples_df.to_csv("selected_accession_to_nb_samples.csv", index=False) + + logger.info("Sampling experiments randomly") + selected_accessions, sampling_quota_reached = sample_experiments_randomly( + selected_accession_to_nb_samples, + args.random_sampling_size, + args.random_sampling_seed, + ) + logger.info( + f"Kept {len(selected_accessions)} experiments after random sampling" + ) + + if sampling_quota_reached: + sampling_status = "full" + + # writing status to file + # so that the wrapper module can get the status + with open(SAMPLING_QUOTA_OUTFILE, "w") as fout: + fout.write(sampling_status) + + # keeping metadata only for selected experiments + selected_experiments = get_metadata_for_selected_experiments(experiments, results) + + if not selected_accessions: + logger.warning( + f"Could not find experiments for species {species_name} and keywords {keywords}" + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING DATA + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # exporting list of accessions + logger.info(f"Writing accessions to {ACCESSION_OUTFILE_NAME}") with open(ACCESSION_OUTFILE_NAME, "w") as fout: fout.writelines([f"{acc}\n" for acc in selected_accessions]) + # exporting metadata + logger.info( + f"Writing metadata of all experiments for species {species_name} to {SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME}" + ) + df = pd.DataFrame.from_dict(experiments) + df.to_csv( + SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME, sep="\t", index=False, header=True + ) + + if selected_experiments: + logger.info( + f"Writing metadata of filtered experiments to {SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME}" + ) + 
df = pd.DataFrame.from_dict(selected_experiments) + df.to_csv( + SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME, + sep="\t", + index=False, + header=True, + ) + + if results: + # exporting list of selected experiments with their keywords + logger.info( + f"Writing filtered experiments with keywords to {FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME}" + ) + with open(FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME, "w") as fout: + yaml.dump(results, fout) + if __name__ == "__main__": main() diff --git a/bin/get_eatlas_data.R b/bin/get_eatlas_data.R deleted file mode 100755 index 0987e01a..00000000 --- a/bin/get_eatlas_data.R +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env Rscript - -# Written by Olivier Coen. Released under the MIT license. - -library(ExpressionAtlas) -library(optparse) - -##################################################### -##################################################### -# FUNCTIONS -##################################################### -##################################################### - -get_args <- function() { - option_list <- list( - make_option("--accession", type = "character", help = "Accession number of expression atlas experiment. Example: E-MTAB-552") - ) - - args <- parse_args(OptionParser( - option_list = option_list, - description = "Get expression atlas data" - )) - return(args) -} - -download_expression_atlas_data_with_retries <- function(accession, max_retries = 3, wait_time = 5) { - success <- FALSE - attempts <- 0 - - while (!success && attempts < max_retries) { - attempts <- attempts + 1 - tryCatch({ - atlas_data <- getAtlasData( accession ) - success <- TRUE - message("Download successful on attempt ", attempts) - }, error = function(e) { - message("Attempt ", attempts, " failed: ", e$message) - if (attempts < max_retries) { - message("Retrying in ", wait_time, " seconds...") - Sys.sleep(wait_time) - } else { - message("All attempts failed. 
Please check the URL or your connection.") - } - }) - } - - return(atlas_data) -} - -get_rnaseq_data <- function(data) { - - return(list( - count_data = assays( data )$counts, - count_type = 'raw', - sample_groups = colData(data)$AtlasAssayGroup - )) -} - -get_one_colour_microarray_data <- function(data) { - - return(list( - count_data = exprs( data ), - count_type = 'normalized', - sample_groups = phenoData(data)$AtlasAssayGroup - )) -} - -get_batch_id <- function(accession, data_type) { - batch_id <- paste0(accession, '_', data_type) - # cleaning - batch_id <- gsub("-", "_", batch_id) - return(batch_id) -} - -get_new_sample_names <- function(result, batch_id) { - new_colnames <- paste0(batch_id, '_', colnames(result$count_data)) - return(new_colnames) -} - -export_count_data <- function(result, batch_id) { - - # renaming columns, to make them specific to accession and data type - colnames(result$count_data) <- get_new_sample_names(result, batch_id) - - outfilename <- paste0(batch_id, '.', result$count_type, '.csv') - - # exporting to CSV file - # index represents gene names - print(paste('Exporting count data to file', outfilename)) - write.table(result$count_data, outfilename, sep = ',', row.names = TRUE, col.names = TRUE, quote = FALSE) -} - -export_metadata <- function(result, batch_id) { - - new_colnames <- get_new_sample_names(result, batch_id) - batch_list <- rep(batch_id, length(new_colnames)) - - df <- data.frame(batch = batch_list, condition = result$sample_groups, sample = new_colnames) - - outfilename <- paste0(batch_id, '.design.csv') - print(paste('Exporting metadata to file', outfilename)) - write.table(df, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) -} - - -process_data <- function(atlas_data, accession) { - - eset <- atlas_data[[ accession ]] - - # looping through each data type (ex: 'rnaseq') in the experiment - for (data_type in names(eset)) { - - data <- eset[[ data_type ]] - - skip_iteration <- FALSE - # getting 
count dataframe - tryCatch({ - - if (data_type == 'rnaseq') { - result <- get_rnaseq_data(data) - } else if (startsWith(data_type, 'A-AFFY-')) { - result <- get_one_colour_microarray_data(data) - } else { - stop(paste('ERROR: Unknown data type:', data_type)) - } - - }, error = function(e) { - print(paste("Caught an error: ", e$message)) - print(paste('ERROR: Could not get assay data for experiment ID', accession, 'and data type', data_type)) - skip_iteration <- TRUE - }) - - # If an error occurred, skip to the next iteration - if (skip_iteration) { - next - } - - batch_id <- get_batch_id(accession, data_type) - - # exporting count data to CSV - export_count_data(result, batch_id) - - # exporting metadata to CSV - export_metadata(result, batch_id) - } - -} - -##################################################### -##################################################### -# MAIN -##################################################### -##################################################### - -args <- get_args() - -# searching and downloading expression atlas data -atlas_data <- download_expression_atlas_data_with_retries(args$accession) - -# writing count data in atlas_data to specific CSV files -process_data(atlas_data, args$accession) - diff --git a/bin/get_geo_dataset_accessions.py b/bin/get_geo_dataset_accessions.py new file mode 100755 index 00000000..d20b3ef3 --- /dev/null +++ b/bin/get_geo_dataset_accessions.py @@ -0,0 +1,961 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import random +import tarfile +from functools import partial +from multiprocessing import Pool +from pathlib import Path +from urllib.request import urlretrieve + +import httpx +import pandas as pd +import xmltodict +from Bio import Entrez +from natural_language_utils import keywords_in_fields +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# set a custom writable directory before any Entrez operations +# mandatory for running the script in an apptainer container +# Entrez.Parser.Parser.directory("/tmp/biopython") + +ALLOWED_PLATFORMS = ["rnaseq", "microarray"] + +ACCESSION_OUTFILE_NAME = "accessions.txt" +SPECIES_DATASETS_OUTFILE_NAME = "geo_all_datasets.metadata.tsv" +REJECTED_DATASETS_OUTFILE_NAME = "geo_rejected_datasets.metadata.tsv" +# WRONG_SPECS_DATASETS_METADATA_OUTFILE_NAME = "geo_wrong_platform_moltype_datasets.metadata.tsv" +# WRONG_KEYWORDS_DATASETS_METADATA_OUTFILE_NAME = "geo_wrong_keywords_datasets.metadata.tsv" +# PLATFORM_NOT_AVAILABLE_DATASETS_METADATA_OUTFILE_NAME = "platform_not_available_datasets.metadata.tsv" +# GENE_ID_MAPPING_ISSUES_DATASETS_METADATA_OUTFILE_NAME = "gene_id_mapping_issues_datasets.metadata.tsv" +SELECTED_DATASETS_OUTFILE_NAME = "geo_selected_datasets.metadata.tsv" + +ENTREZ_QUERY_MAX_RESULTS = 9999 +ENTREZ_EMAIL = "stableexpression@nfcore.com" +PLATFORM_METADATA_CHUNKSIZE = 2000 + +NCBI_API_BASE_URL = ( + "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc={accession}" +) +STOP_RETRY_AFTER_DELAY = 600 + +NB_PROBE_IDS_TO_PARSE = 1000 +NB_PROBE_IDS_TO_SAMPLE = 10 + +SUPERSERIES_SUMMARY = "This SuperSeries is composed of the SubSeries listed below." 
+ +ALLOWED_LIBRARY_SOURCES = ["transcriptomic", "RNA"] +ALLOWED_MOLECULE_TYPES = ["RNA", "SRA"] + +GEO_EXPERIMENT_TYPE_TO_PLATFORM = { + "Expression profiling by array": "microarray", + "Expression profiling by high throughput sequencing": "rnaseq", +} + +MINIML_TMPDIR = "geo_miniml" +PLATFORM_SOFT_TMPDIR = "geo_platform_soft" +Path(MINIML_TMPDIR).mkdir(exist_ok=True) +Path(PLATFORM_SOFT_TMPDIR).mkdir(exist_ok=True) + + +################################################################## +################################################################## +# EXCEPTIONS +################################################################## +################################################################## + + +class GeoDatasetNothingFoundError(Exception): + pass + + +class GeoPlatformDataTableNotFound(Exception): + pass + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get GEO Datasets accessions") + parser.add_argument( + "--species", + type=str, + required=True, + help="Search GEO Datasets for this specific species", + ) + parser.add_argument( + "--keywords", + type=str, + nargs="*", + help="Keywords to search for in datasets description", + ) + parser.add_argument( + "--platform", type=str, help="Platform type", choices=ALLOWED_PLATFORMS + ) + parser.add_argument( + "--exclude-accessions-in", + dest="excluded_accessions_file", + type=Path, + help="Exclude accessions contained in this file", + ) + parser.add_argument( + "--random-sampling-size", + dest="random_sampling_size", + type=int, + help="Random sampling size", + ) + parser.add_argument( + "--random-sampling-seed", + dest="random_sampling_seed", + type=int, + help="Random sampling seed", + ) + parser.add_argument( + 
"--cpus", type=int, dest="nb_cpus", required=True, help="Number of CPUs" + ) + parser.add_argument( + "--accessions", + type=str, + help="[For dev purposes / testing: provide directly accessions (separated by commas) and try to get their metadata]", + ) + return parser.parse_args() + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# QUERIES TO ENTREZ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: {}), +) +def send_request_to_entrez_esearch(query: str) -> dict: + Entrez.email = ENTREZ_EMAIL + with Entrez.esearch( + db="gds", term=query, retmax=ENTREZ_QUERY_MAX_RESULTS + ) as handle: + return Entrez.read(handle) + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: []), +) +def send_request_to_entrez_esummary(ids: list[str]) -> list[dict]: + Entrez.email = ENTREZ_EMAIL + ids_str = ",".join(ids) + with Entrez.esummary( + db="gds", id=ids_str, retmax=ENTREZ_QUERY_MAX_RESULTS + ) as handle: + return Entrez.read(handle) + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: None), +) +def send_request_to_ncbi_api(accession: str) -> httpx.Response | None: + url = NCBI_API_BASE_URL.format(accession=accession) + server_error = False + response = None + + try: + response = httpx.get(url) + except httpx.ConnectError: + server_error = True + else: + try: + response.raise_for_status() + except Exception as err: + if str(response.status_code).startswith("5"): # error 500 -> 509 + server_error = True + raise err + else: + 
logger.error( + f"Error {response.status_code} while sending request to NCBI: {err}" + ) + raise err + + # if we get connection issues or 500 -> 509 server errors + # we stop immediately for this accession (return None) + if server_error: + logger.critical( + f"Server error while sending request to NCBI for accession {accession}" + ) + + return response + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: None), +) +def download_file_at_url(url: str, output_file: Path): + urlretrieve(url, output_file) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# GEO DATASETS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def chunk_list(lst: list, chunksize: int) -> list: + """Splits a list into chunks of a given size. + + Args: + lst (list): The list to split. + chunksize (int): The size of each chunk. + + Returns: + list: A list of chunks, where each chunk is a list of len(chunksize). + """ + return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] + + +def fetch_geo_datasets_for_species(species: str) -> list[dict]: + """ + Fetch GEO datasets (GSE series) for a given species + + Args: + species (str): Scientific name of the species (e.g. "Homo sapiens"). 
+ """ + dataset_types = [ + f'"{experiment_type}"[DataSet Type]' + for experiment_type in GEO_EXPERIMENT_TYPE_TO_PLATFORM + ] + formatted_dataset_type = "(" + " OR ".join(dataset_types) + ")" + + query = f'"{species}"[Organism] AND "gse"[Entry Type] AND {formatted_dataset_type}' + logger.info(f"Fetching GEO datasets with query: {query}") + + # getting list of all datasets IDs for this species + # we need possibly to perform multiple queries because the max number of returned results is capped + nb_entries = None + retstart = 0 + record = {} + while not nb_entries or retstart < nb_entries: + record = send_request_to_entrez_esearch(query) + + if not record: + logger.warning(f"Failed to query Entrey Esearch with query: {query}") + return [] + + # getting total nb of entries + if not nb_entries: + nb_entries = int(record["Count"]) + + # if there is no entry for this species + if nb_entries == 0: + logger.info(f"No entries found for query: {query}") + return [] + + # setting next cursor to the next group + retstart += ENTREZ_QUERY_MAX_RESULTS + + ids = record.get("IdList", []) + if not ids: + logger.warning("No GEO datasets found for your query.") + return [] + + # fetching summary info + results = send_request_to_entrez_esummary(ids) + + # keeping only series datasets (just a double check here) + # and removing superseries (they are just containers of series that are also contained here) + return [ + r + for r in results + if "GSE" in r["Accession"] and r["summary"] != SUPERSERIES_SUMMARY + ] + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# FORMATTING +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def format_species(species: str) -> str: + return "_".join(species.lower().split(" ")) + + +def format_platform_name(platform_name: str) -> str: + return platform_name.replace("_", "").replace("-", "").lower() + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# GET METADATA +# 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def download_dataset_metadata(ftp_link: str, accession: str) -> Path | None: + filename = f"miniml/{accession}_family.xml.tgz" + ftp_url = ftp_link + filename + output_file = Path(MINIML_TMPDIR) / f"{accession}.tar.gz" + download_file_at_url(ftp_url, output_file) + if output_file.exists(): + return output_file + else: + logger.error(f"Failed to download dataset metadata for accession: {accession}") + return None + + +def parse_dataset_metadata(file: Path, accession: str) -> dict | None: + with tarfile.open(file, "r:gz") as tar: + file_to_read = f"{accession}_family.xml" + + try: + f = tar.extractfile(file_to_read) + except KeyError: + file_to_read = f"{accession}_family.xml/{accession}_family.xml" + try: + f = tar.extractfile(file_to_read) + except KeyError: + return None + + if f is None: + logger.warning(f"Failed to get file: {file_to_read}") + return None + + try: + xml_content = f.read().decode("utf-8") + except UnicodeDecodeError: + logger.warning(f"Failed to decode file: {file_to_read}") + return None + + return xmltodict.parse(xml_content)["MINiML"] + + +def parse_characteristics( + characteristics: str | dict | list, stored_characteristics: list +): + if isinstance(characteristics, str): + stored_characteristics.append(characteristics) + elif isinstance(characteristics, dict): + if "#text" in characteristics: + stored_characteristics.append(characteristics["#text"]) + elif isinstance(characteristics, list): + for c in characteristics: + parse_characteristics(c, stored_characteristics) + + +def parse_interesting_metadata( + dataset_metadata: dict, additional_metadata: dict +) -> dict: + """ + Parses interesting metadata from a dataset metadata dictionary and additional metadata dictionary. + + Args: + dataset_metadata (dict): The dataset metadata dictionary. + additional_metadata (dict): The additional metadata dictionary. + + Returns: + dict: The parsed interesting metadata dictionary. 
+ """ + sample_characteristics = [] + sample_library_strategies = [] + sample_library_sources = [] + sample_descriptions = [] + sample_titles = [] + sample_molecule_types = [] + + platform_accessions = [ + "GPL" + gpl_id for gpl_id in dataset_metadata["GPL"].split(";") + ] + + experiment_types = dataset_metadata["gdsType"] + experiment_types = ( + experiment_types if isinstance(experiment_types, list) else [experiment_types] + ) + + # if additional metadata have sample information + if "Sample" in additional_metadata: + # change to list if it's a single dictionary + if isinstance(additional_metadata["Sample"], dict): + additional_metadata["Sample"] = [additional_metadata["Sample"]] + + for sample in additional_metadata["Sample"]: + # storing description if exists + if sample_description := sample.get("Description"): + sample_descriptions.append(sample_description) + + # storing title if exists + if sample_title := sample.get("Title"): + sample_titles.append(sample_title) + + # storing molecule type if exists + if sample_molecule_type := sample.get("Type"): + sample_molecule_types.append(sample_molecule_type) + + # storing library strategy if exists + if sample_library_strategy := sample.get("Library-Strategy"): + sample_library_strategies.append(sample_library_strategy) + + # storing library source if exists + if sample_library_source := sample.get("Library-Source"): + sample_library_sources.append(sample_library_source) + + # parsing sample metadata + if channels := sample.get("Channel"): + if isinstance(channels, dict): + channels = [channels] + for channel in channels: + parse_characteristics( + channel["Characteristics"], sample_characteristics + ) + + return { + "accession": dataset_metadata["Accession"], + "taxon": dataset_metadata["taxon"], + "platform_accessions": platform_accessions, + "summary": dataset_metadata["summary"], + "title": dataset_metadata["title"], + "overall_design": additional_metadata["Series"]["Overall-Design"], + "experiment_types": 
experiment_types, + "sample_characteristics": list(set(sample_characteristics)), + "sample_library_strategies": list(set(sample_library_strategies)), + "sample_library_sources": list(set(sample_library_sources)), + "sample_descriptions": list(set(sample_descriptions)), + "sample_titles": list(set(sample_titles)), + "sample_molecule_types": list(set(sample_molecule_types)), + } + + +def fetch_dataset_metadata(dataset_metadata: dict) -> dict | None: + """ + Parses metadata from a dataset metadata dictionary. + + Args: + dataset_metadata (dict): The dataset metadata dictionary. + + Returns: + dict | None: The parsed metadata dictionary or None if the metadata is missing. + """ + accession = dataset_metadata["Accession"] + ftp_link = dataset_metadata["FTPLink"].replace("ftp://", "https://") + downloaded_file = download_dataset_metadata(ftp_link, accession) + if downloaded_file is None: + logger.warning(f"Skipping {accession} as metadata download failed") + return None + + additional_metadata = parse_dataset_metadata(downloaded_file, accession) + + # if we could not get additional metadata, we lack too much information to conclude + if additional_metadata is None: + logger.warning(f"Skipping {accession} as additional metadata is missing") + return None + + # parsing interesting information in all available metadata + return parse_interesting_metadata(dataset_metadata, additional_metadata) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# METADATA TESTS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def exclude_unwanted_accessions( + datasets: list[dict], excluded_accessions: list[str] +) -> tuple[list[dict], list[dict]]: + datasets_to_keep = [] + excluded_datasets = [] + for dataset in datasets: + if dataset["accession"] in excluded_accessions: + excluded_datasets.append(dataset) + else: + datasets_to_keep.append(dataset) + return datasets_to_keep, excluded_datasets + + +def check_species_issues(parsed_species_list: 
list, species: str) -> str | None: + # trying to find our species in the list of species parsed + for parsed_species in parsed_species_list: + if format_species(parsed_species) == format_species(species): + return None + return f"PARSED SPECIES: {parsed_species_list}" + + +def check_molecule_type_issues(molecules_types: list) -> str | None: + # we want only GEO series that contain only RNA molecules + # for other series, they should be superseries contained other series that are being parsed too + # so anyway, this would lead in duplicates + if any( + [ + molecule_type.upper() in ALLOWED_MOLECULE_TYPES + for molecule_type in molecules_types + ] + ): + return None + return f"MOLECULE TYPES: {molecules_types}" + + +def check_experiment_type_issues(experiment_types: list, platform: str) -> str | None: + for experiment_type in experiment_types: + # if at least one experiment type is ok, we keep this dataset + if GEO_EXPERIMENT_TYPE_TO_PLATFORM.get(experiment_type) == platform: + return None + return f"EXPERIMENT TYPES: {experiment_types}" + + +def check_source_issues(library_sources: list) -> str | None: + # if we have no data about library sources, we just cannot infer + if not library_sources: + return None + if any( + library_source in ALLOWED_LIBRARY_SOURCES for library_source in library_sources + ): + return None + return f"LIBRARY SOURCES: {library_sources}" + + +def search_keywords(dataset: dict, keywords: list[str]) -> tuple[list, str | None]: + accession = dataset["accession"] + all_searchable_fields = ( + [dataset["summary"], dataset["title"]] + + dataset["sample_characteristics"] + + dataset["sample_descriptions"] + + dataset["sample_titles"] + ) + found_keywords = keywords_in_fields(all_searchable_fields, keywords) + # only returning experiments if found keywords + if found_keywords: + dataset["found_keywords"] = list(set(found_keywords)) + logger.info(f"Found keywords: {found_keywords} in accession {accession}") + return found_keywords, None + else: + 
return [], "NO KEYWORDS_FOUND" + + +def check_dataset( + dataset: dict, species: str, platform: str | None, keywords: list[str] | None +) -> tuple[list, dict]: + accession = dataset["accession"] + parsed_species_list = dataset["taxon"].split("; ") + experiment_types = dataset["experiment_types"] + library_sources = dataset["sample_library_sources"] + molecules_types = dataset["sample_molecule_types"] + + issues = [] + + # checking species + if issue := check_species_issues(parsed_species_list, species): + issues.append(issue) + + # checking platform + if platform is not None: + if issue := check_experiment_type_issues(experiment_types, platform): + issues.append(issue) + + # checking that library sources fit + if issue := check_source_issues(library_sources): + issues.append(issue) + + # checking that all molecule types are RNA + if issue := check_molecule_type_issues(molecules_types): + issues.append(issue) + + found_keywords = [] + if keywords: + found_keywords, keyword_issue = search_keywords(dataset, keywords) + if keyword_issue: + issues.append(keyword_issue) + + if issues: + rejection_dict = {accession: issues} + else: + rejection_dict = {} + + return found_keywords, rejection_dict + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# GEO PLATFORMS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def fetch_geo_platform_metadata(datasets: list[dict]) -> dict: + """ + Fetch data for a GEO platform + + Args: + platform_accession (str): accession of the platform + """ + # unique list of platform accessions + platform_accessions = list( + set( + [ + platform_accession + for dataset in datasets + for platform_accession in dataset["platform_accessions"] + ] + ) + ) + # formating query + formatted_platform_accessions = [ + f'"{platform_accession}"[GEO Accession]' + for platform_accession in platform_accessions + ] + platform_accessions_str = " OR ".join(formatted_platform_accessions) + query = 
f'({platform_accessions_str}) AND "gpl"[Entry Type] ' + + record = send_request_to_entrez_esearch(query=query) + + ids = record.get("IdList", []) + if not ids: + logger.warning(f"No GEO platform found for accessions {platform_accessions}.") + return {} + + # fetching summary info + # one single request to NCBI for all platform accessions + platform_metadatas = send_request_to_entrez_esummary(ids) + # return dict associating dataset accessions with platform metadata + return { + platform_metadata["Accession"]: platform_metadata + for platform_metadata in platform_metadatas + } + + +def check_dataset_platforms( + dataset: dict, accession_to_platform_metadata: dict, species: str +) -> dict: + accession = dataset["accession"] + platform_accessions = dataset["platform_accessions"] + + if not platform_accessions: + return {accession: "NO PLATFORM ACCESSIONS"} + + platforms_metadata = [ + accession_to_platform_metadata[platform_accession] + for platform_accession in dataset["platform_accessions"] + ] + + # getting list of platform taxon + platforms_taxons = [] + for metadata in platforms_metadata: + if metadata.get("taxon") is not None: + platforms_taxons += metadata.get("taxon").split("; ") + platforms_taxons = list(set(platforms_taxons)) + + if not platforms_taxons: + return {accession: "NO PLATFORM TAXON"} + + # checking if at least one of the platform accession is the good one + # sample will be further filtered during download (download_geo_data.R) + if not any( + format_species(species) == format_species(taxon) for taxon in platforms_taxons + ): + return {accession: f"TAXON MISMATCH: {platforms_taxons}"} + + return {} + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# RANDOM SAMPLING +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def sample_experiments_randomly( + experiments: list[dict], sampling_size: int, seed: int +) -> list[str]: + random.seed(seed) + sampled_experiments = [] + + total_nb_samples = 0 + 
experiments_left = list(experiments) + while experiments_left and total_nb_samples <= sampling_size: + # if the min number of samples is greater than the remaining space left, we get out of the loop + experiments_left_nb_samples = [exp["nb_samples"] for exp in experiments_left] + min_nb_samples = min(experiments_left_nb_samples) + if min_nb_samples > sampling_size - total_nb_samples: + break + + found_experiment = False + test_total_nb_samples = int(total_nb_samples) + not_chosen_yet = list(experiments_left) + while not_chosen_yet and not found_experiment: + experiment = random.choice(not_chosen_yet) + not_chosen_yet.remove(experiment) + test_total_nb_samples = total_nb_samples + experiment["nb_samples"] + if test_total_nb_samples <= sampling_size: + found_experiment = True + + # if the last one was not good, it means we reached the limit of samples we can take + if not found_experiment: + break + else: + total_nb_samples = test_total_nb_samples + experiments_left.remove(experiment) + sampled_experiments.append(experiment) + + return [exp["accession"] for exp in sampled_experiments] + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# EXPORT +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def sort_if_list(x): + if isinstance(x, list): + return sorted(x) + else: + return x + + +def export_dataset_metadatas( + datasets: list[dict], output_file: str, clean_columns: bool = True +): + if datasets: + df = pd.DataFrame.from_dict(datasets) + # all dataframe contain the column "accession" + # sorting by accessions to ensure that outputs are reproducible + df.sort_values(by="accession", inplace=True) + for col in df.columns: + df[col] = df[col].apply(sort_if_list) + # cleaning columns so that MultiQC can parse them + if clean_columns: + for col in df.columns: + df[col] = df[col].astype(str).str.replace("\n", "") + df[col] = df[col].astype(str).str.replace("\t", "") + df.to_csv( + output_file, + sep="\t", + index=False, + 
header=True, + ) + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + + random_sampling_size = args.random_sampling_size + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING GEO DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Getting datasets corresponding to species {args.species}") + datasets = fetch_geo_datasets_for_species(args.species) + logger.info(f"Found {len(datasets)} datasets for species {args.species}") + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # FOR DEV PURPOSES / TESTING: RESTRICT TO SPECIFIC ACCESSIONS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + if args.accessions: + logger.info(f"Keeping only accessions {args.accessions}") + dev_accessions = args.accessions.split(",") + datasets = [d for d in datasets if d["Accession"] in dev_accessions] + logger.info(f"Kept {len(datasets)} datasets for dev / testing purposes") + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING DATASET METADATA + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Parsing metadata for {len(datasets)} datasets") + augmented_datasets = [] + with ( + Pool(processes=args.nb_cpus) as p, + tqdm(total=len(datasets)) as pbar, + ): + for result in p.imap_unordered(fetch_dataset_metadata, datasets): + pbar.update() + pbar.refresh() + if result is None: + continue + augmented_datasets.append(result) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # VALIDATING DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Validating {len(augmented_datasets)} datasets") + checked_datasets = [] + rejection_dict = {} + for dataset in tqdm(augmented_datasets): + found_keywords, 
issue_dict = check_dataset( + dataset, args.species, args.platform, args.keywords + ) + if issue_dict: + rejection_dict |= issue_dict + else: + if found_keywords: + dataset["found_keywords"] = found_keywords + checked_datasets.append(dataset) + + logger.info(f"Validated {len(checked_datasets)} datasets") + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXCLUDING UNWANTED ACCESSIONS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # we exclude unwanted accessions only now + # because we want to get the metadata of the excluded datasets + # in order to adjust the random sampling size + if args.excluded_accessions_file: + # parsing list of accessions which were already fetched from Expression Atlas + with open(args.excluded_accessions_file) as fin: + excluded_accessions = fin.read().splitlines() + logger.info("Excluding unwanted datasets") + checked_datasets, excluded_datasets = exclude_unwanted_accessions( + checked_datasets, excluded_accessions + ) + logger.info( + f"{len(checked_datasets)} datasets remaining after excluding unwanted accessions" + ) + + # adjusting random sampling size by substracting the number of excluded accessions + if random_sampling_size: + total_nb_excluded_samples = sum( + [len(dataset["sample_titles"]) for dataset in excluded_datasets] + ) + logger.info( + f"Subtracting {total_nb_excluded_samples} samples from random sampling size" + ) + random_sampling_size -= total_nb_excluded_samples + # keeping it positive (just in case) + if random_sampling_size < 0: + logger.warning( + f"Random sampling size is negative ({random_sampling_size}), setting it to 0" + ) + random_sampling_size = 0 + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # GETTING METADATA OF SEQUENCING PLATFORMS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info("Getting platform metadata") + # making chunks to group httpx to NCBI GEO + checked_datasets_chunks = chunk_list(checked_datasets, PLATFORM_METADATA_CHUNKSIZE) + # 
resetting selecting datasets + accession_to_platform_metadata = {} + for selected_datasets_chunk in tqdm(checked_datasets_chunks): + accession_to_platform_metadata |= fetch_geo_platform_metadata( + selected_datasets_chunk + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # VALIDATING EACH PLATFORM SEPARATELY, DATASET BY DATASET + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Checking each platform for {len(checked_datasets)} datasets") + func = partial( + check_dataset_platforms, + accession_to_platform_metadata=accession_to_platform_metadata, + species=args.species, + ) + selected_datasets = [] + # resetting selecting datasets + for dataset in tqdm(checked_datasets): + accession = dataset["accession"] + issue_dict = func(dataset) + if issue_dict: + if accession in rejection_dict: # should not happen but in case + rejection_dict[accession] += issue_dict[accession] + else: + rejection_dict |= issue_dict + else: + selected_datasets.append(dataset) + + if rejection_dict: + logger.warning(f"{len(rejection_dict)} datasets rejected") + logger.warning(f"Reasons for rejection: {rejection_dict}") + + selected_accessions = sorted( + [dataset["accession"] for dataset in selected_datasets] + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # RANDOM SAMPLING + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + if random_sampling_size is not None and args.random_sampling_seed is not None: + selected_accession_to_nb_samples = [ + { + "accession": dataset["accession"], + "nb_samples": len(dataset["sample_titles"]), + } + for dataset in selected_datasets + ] + + nb_samples_df = pd.DataFrame.from_dict(selected_accession_to_nb_samples) + nb_samples_df.to_csv("selected_accession_to_nb_samples.csv", index=False) + + logger.info("Sampling experiments randomly") + selected_accessions = sample_experiments_randomly( + selected_accession_to_nb_samples, + random_sampling_size, + args.random_sampling_seed, + ) + logger.info( + f"Kept 
{len(selected_accessions)} experiments after random sampling" + ) + selected_datasets = [ + dataset + for dataset in selected_datasets + if dataset["accession"] in selected_accessions + ] + else: + logger.info( + f"No random sampling requested. Kept {len(selected_datasets)} datasets" + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING ACCESSIONS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # sorting accessions to ensure that outputs are reproducible + selected_accessions = sorted(selected_accessions) + with open(ACCESSION_OUTFILE_NAME, "w") as fout: + fout.write("\n".join(selected_accessions)) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + export_dataset_metadatas(augmented_datasets, SPECIES_DATASETS_OUTFILE_NAME) + export_dataset_metadatas(selected_datasets, SELECTED_DATASETS_OUTFILE_NAME) + + rejected_datasets = [ + {"accession": accession, "reason": reason} + for accession, reason in rejection_dict.items() + ] + export_dataset_metadatas( + rejected_datasets, REJECTED_DATASETS_OUTFILE_NAME, clean_columns=False + ) + + +if __name__ == "__main__": + main() diff --git a/bin/get_ratio_standard_variation.py b/bin/get_ratio_standard_variation.py new file mode 100755 index 00000000..76d4ebfd --- /dev/null +++ b/bin/get_ratio_standard_variation.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# experimentally chosen +RATIO_CHUNK_SIZE = 100 + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file", + type=Path, + dest="ratio_file", + required=True, + help="File log of pairwise expression ratios", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_nb_rows(lf: pl.LazyFrame): + return lf.select(pl.len()).collect().item() + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + """Get all column names except the config.GENE_ID_COLNAME column. + + The config.GENE_ID_COLNAME column contains only gene IDs. 
+ """ + return [ + col + for col in lf.collect_schema().names() + if not col.startswith(config.GENE_ID_COLNAME) + ] + + +def compute_standard_deviations(file: Path, low_memory: bool) -> pl.LazyFrame: + ratios_lf = pl.scan_parquet(file, low_memory=low_memory) + ratio_columns = [ + col for col in ratios_lf.collect_schema().names() if col.endswith("_log_ratio") + ] + concat_ratios_lf = ratios_lf.select( + [ + pl.concat_list( + [pl.col(col) for col in ratio_columns[i : i + RATIO_CHUNK_SIZE]] + ).alias(f"concat_list_chunk_{i // RATIO_CHUNK_SIZE}") + for i in range(0, len(ratio_columns), RATIO_CHUNK_SIZE) + ] + ).select(pl.concat_list(pl.all()).alias("ratios")) + return pl.concat( + [ + concat_ratios_lf.select("ratios"), + ratios_lf.select(pl.exclude("^.*_log_ratio$")), # gene_id & gene_id_other + ], + how="horizontal", + ).select( + pl.col("ratios").list.std(ddof=0).alias(config.RATIOS_STD_COLNAME), + pl.col(config.GENE_ID_COLNAME), + pl.col(f"{config.GENE_ID_COLNAME}_other"), + ) + + +def get_column_standard_deviations(std_lf: pl.LazyFrame, column: str) -> pl.LazyFrame: + # column is either config.GENE_ID_COLNAME or f"{config.GENE_ID_COLNAME}_other" + return ( + std_lf.group_by(column) + .agg(config.RATIOS_STD_COLNAME) # getting list of ratio std for this gene + .select( + pl.col(column).alias(config.GENE_ID_COLNAME), + pl.col(config.RATIOS_STD_COLNAME), + ) + ) + + +def group_standard_deviations(std_lf: pl.LazyFrame) -> pl.LazyFrame: + # getting the standard devs for genes in the gene_id column + std_a = get_column_standard_deviations(std_lf, column=config.GENE_ID_COLNAME) + # getting the standard devs for genes in the gene_id_other column + std_b = get_column_standard_deviations( + std_lf, column=f"{config.GENE_ID_COLNAME}_other" + ) + # concatenating both dataframes vertically + # if both lists of gene ids are the identical, + # we need to collect values only for one column to avoid duplicates + return ( + pl.concat([std_a, std_b], how="vertical") + 
.unique(subset=config.GENE_ID_COLNAME) + .sort( + config.GENE_ID_COLNAME + ) # only needed to have consistent output (for snapshots) + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + std_lf = compute_standard_deviations(args.ratio_file, low_memory) + std_lf = group_standard_deviations(std_lf) + + # when the ratio file corresponds to the same gene ids cross joined with themselves (i == i) + # then we want only only one row per gene id + + std_df = std_lf.collect() + if len(std_df) == 0: + raise ValueError( + f"No output following treatment of file {str(args.ratio_file)}" + ) + + outfile = args.ratio_file.name.replace("ratios", "std") + std_df.write_parquet(outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/get_variation_coefficient.R b/bin/get_variation_coefficient.R deleted file mode 100755 index 72d83973..00000000 --- a/bin/get_variation_coefficient.R +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env Rscript - -# Written by Olivier Coen. Released under the MIT license. 
- -library(optparse) - -##################################################### -##################################################### -# FUNCTIONS -##################################################### -##################################################### - - -get_args <- function() { - - option_list <- list( - make_option("--count-files", dest = 'files', help = "Files to concatenate") - ) - - args <- parse_args(OptionParser( - option_list = option_list, - description = "Get variation coefficient from count data for each gene" - )) - - return(args) -} - -merge_count_files <- function(file_list) { - # Read and merge CSV files - concat_df <- NULL - for (file in file_list) { - df <- read.csv(file, row.names = 1, header=TRUE, stringsAsFactors = FALSE) - if (is.null(concat_df)) { - concat_df <- df - } else { - # Perform outer join by row names - concat_df <- merge(concat_df, df, by = "row.names", all = TRUE) - rownames(concat_df) <- concat_df$Row.names - concat_df <- concat_df[, -1] - } - } - return(concat_df) -} - - -average_log2 <- function(row) { - # the dataframe has already been filtered to exclude rows where mean is 0 - return(mean(log2(row + 1))) # adds 1 to avoid log(0) and to stabilize variance -} - -get_variation_coefficient <- function(count_data) { - - print('Getting coefficients of variation') - - # filter the dataframe to exclude rows where the mean is 0 - count_data <- count_data[rowMeans(count_data) != 0, ] - - # filter the dataframe to exclude rows where row mean is in the top 5% or bottom 5% - # determine the percentile thresholds - row_means <- rowMeans(count_data, na.rm = TRUE) - lower_threshold <- quantile(row_means, 0.05, na.rm = TRUE) - upper_threshold <- quantile(row_means, 0.95, na.rm = TRUE) - - count_data <- count_data[row_means >= lower_threshold & row_means <= upper_threshold, ] - - # calculate the coefficient of variation - row_means <- rowMeans(count_data) - row_sds <- apply(count_data, 1, sd) - cv <- row_sds / row_means - - av_log_cpm <- 
apply(count_data, 1, average_log2) - # combine results into a dataframe - df <- data.frame( - #gene = rownames(count_data), - variation_coefficient = cv, - average_log_cpm = av_log_cpm - ) - - df <- df[order(df$variation_coefficient, decreasing = FALSE), ] - - return(df) -} - -export_data <- function(cv_df, count_data) { - count_outfilename <- 'all_normalized_counts.csv' - print(paste('Exporting normalized counts to:', count_outfilename)) - write.table(count_data, sep=",", file=count_outfilename, row.names = TRUE, col.names = NA, quote = FALSE) - cv_outfilename <- 'variation_coefficients.csv' - print(paste('Exporting variation coefficients to:', cv_outfilename)) - write.table(cv_df, sep=",", file=cv_outfilename, row.names = TRUE, col.names = NA, quote = FALSE) -} - -##################################################### -##################################################### -# MAIN -##################################################### -##################################################### - -args <- get_args() - -file_list <- strsplit(args$files, " ")[[1]] -count_data <- merge_count_files(file_list) - -cv_df <- get_variation_coefficient(count_data) - -export_data(cv_df, count_data) diff --git a/bin/gprofiler_map_ids.py b/bin/gprofiler_map_ids.py new file mode 100755 index 00000000..4a559bde --- /dev/null +++ b/bin/gprofiler_map_ids.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import sys +from pathlib import Path + +import config +import pandas as pd +from gprofiler_utils import convert_ids + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +MAPPED_GENE_IDS_OUTFILE = "mapped_gene_ids.csv" +METADATA_OUTFILE = "gene_metadata.csv" + +TARGET_DATABASE_CHOICES = ["ENTREZGENE", "ENSG"] + +FAILURE_REASON_FILE = "failure_reason.txt" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Map IDs using g:Profiler") + parser.add_argument( + "--gene-ids", + type=Path, + dest="gene_id_file", + required=True, + help="Input file containing gene IDs", + ) + parser.add_argument( + "--species", type=str, required=True, help="Species to convert IDs for" + ) + parser.add_argument( + "--target-db", + type=str, + dest="gprofiler_target_db", + required=True, + choices=TARGET_DATABASE_CHOICES, + help="Target database to convert IDs to", + ) + return parser.parse_args() + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + with open(args.gene_id_file, "r") as fin: + gene_ids = list(set([line.strip() for line in fin])) + + logger.info(f"Converting {len(gene_ids)} IDs for species {args.species} ") + + ############################################################# + # QUERYING g:PROFILER SERVER + ############################################################# + + gene_metadata_dfs = [] + + mapping_dict, gene_metadata_dfs = convert_ids( + gene_ids, args.species, args.gprofiler_target_db + ) + + if not mapping_dict: + msg = ( + f"No mapping found for gene IDs such as 
{' '.join(gene_ids[:5])} on species {args.species} " + + f"and g:Profiler target database {args.gprofiler_target_db}" + ) + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as fout: + fout.write(msg) + sys.exit(100) + + ############################################################# + # WRITING MAPPING + ############################################################# + + # making dataframe for mapping (only two columns: original and new) + mapping_df = ( + pd.DataFrame(mapping_dict, index=[0]) + .T.reset_index() # transpose: setting keys as indexes instead of columns + .rename( + columns={ + "index": config.ORIGINAL_GENE_ID_COLNAME, + 0: config.GENE_ID_COLNAME, + } + ) + .sort_values(by=config.ORIGINAL_GENE_ID_COLNAME) + ) + mapping_df.to_csv(MAPPED_GENE_IDS_OUTFILE, index=False, header=True) + + ############################################################# + # WRITING METADATA + ############################################################# + + gene_metadata_df = pd.concat(gene_metadata_dfs, ignore_index=True) + # dropping duplicates and keeping the first occurence + gene_metadata_df.drop_duplicates( + subset=[config.GENE_ID_COLNAME], keep="first" + ).sort_values(by=config.GENE_ID_COLNAME).to_csv( + METADATA_OUTFILE, index=False, header=True + ) + + +if __name__ == "__main__": + main() diff --git a/bin/gprofiler_utils.py b/bin/gprofiler_utils.py new file mode 100755 index 00000000..831b94b5 --- /dev/null +++ b/bin/gprofiler_utils.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import logging + +import config +import httpx +import pandas as pd +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +GPROFILER_CONVERT_API_ENDPOINT = "https://biit.cs.ut.ee/gprofiler/api/convert/convert/" +GPROFILER_CONVERT_BETA_API_ENDPOINT = ( + "https://biit.cs.ut.ee/gprofiler_beta/api/convert/convert/" +) + +CHUNKSIZE = 2000 # number of IDs to convert at a time - may create trouble if > 2000 + +COLS_TO_KEEP = ["incoming", "converted", "name", "description"] +DESCRIPTION_PART_TO_REMOVE_REGEX = r"\s*\[Source:.*?\]" + +GPROFILER_ERROR_MESSAGE = ( + "g:Profiler servers (main and beta) seem to be down... Please retry later... " + "If you have gene ID mappings and / or gene metadata for these datasets, you can provide them " + "directly using the `--gene_id_mapping` and `--gene_metadata` parameters respectively, " + "and by skipping the g:Profiler ID mapping step with `--skip_id_mapping`." +) + + +################################################################## +# FUNCTIONS +################################################################## + + +class GProfilerConnectionError(Exception): + pass + + +def format_species_name(species: str): + """ + Format a species name into a format accepted by g:Profiler. + Example: Arabidopsis thaliana -> athaliana + + Parameters + ---------- + species : str + The species name. + + Returns + ------- + str + The formatted species name. 
+ """ + splitted_species = species.lower().replace("_", " ").split(" ") + return splitted_species[0][0] + splitted_species[1] + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def request_conversion( + gene_ids: list, + species: str, + target_database: str, + url: str = GPROFILER_CONVERT_API_ENDPOINT, + attempts: int = 0, +) -> list[str]: + """ + Send a request to the g:Profiler API to convert a list of gene IDs. + + Parameters + ---------- + gene_ids : list + The list of gene IDs to convert. + species : str + The species to convert the IDs for. + url : str, optionalrequest_conversion + The URL to send the request to, by default GPROFILER_CONVERT_API_ENDPOINT + attempts : int, optional + The number of attempts already performed, by default 0 + + Returns + ------- + list + The list of dicts corresponding to the converted IDs. + """ + + # formatting species for g:Profiler + organism = format_species_name(species) + + if attempts > 0: + logger.warning( + "g:Profiler main server appears down, trying with the beta server..." + ) + + server_appears_down = False + + try: + response = httpx.post( + url=url, + json={"organism": organism, "query": gene_ids, "target": target_database}, + ) + except httpx.ConnectError: + server_appears_down = True + else: + try: + response.raise_for_status() + except Exception as err: + if str(response.status_code).startswith("5"): # error 500 -> 509 + server_appears_down = True + else: + logger.error( + f"Error {response.status_code} while converting IDs: {err}" + ) + raise err + + if server_appears_down: + if attempts == 0: + logger.warning( + "g:Profiler main server appears down, trying with the beta server..." + ) + return request_conversion( + gene_ids, + species, + target_database=target_database, + url=GPROFILER_CONVERT_BETA_API_ENDPOINT, # backup endpoint + attempts=1, + ) + else: + # both servers appear down, we stop here... 
+ logger.error(GPROFILER_ERROR_MESSAGE) + raise GProfilerConnectionError(GPROFILER_ERROR_MESSAGE) + + else: + return response.json()["result"] + + +def convert_chunk_of_ids( + gene_ids: list, species: str, gprofiler_target_db: str +) -> tuple[dict, pd.DataFrame]: + """ + Wrapper function that converts a list of gene IDs to another namespace. + + Parameters + ---------- + species : str + The species to convert the IDs for. + gene_ids : list + The IDs to convert. + target_database : str + The target database to convert to. + + Returns + ------- + dict + A dictionary where the keys are the original IDs and the values are the converted IDs. + """ + + results = request_conversion(gene_ids, species, gprofiler_target_db) + df = pd.DataFrame.from_records(results) + + if df.empty: + return {}, pd.DataFrame() + + # keeping only rows where 'converted' is not null and only the columns of interest + df = df.loc[df["converted"] != "None", COLS_TO_KEEP] + + # dict associating incoming IDs to converted IDs + mapping_dict = df.set_index("incoming").to_dict()["converted"] + + # DataFrame associating converted IDs to name and description + meta_df = df.drop(columns=["incoming"]).rename( + columns={"converted": config.GENE_ID_COLNAME} + ) + + meta_df["name"] = meta_df["name"].str.replace(",", ";") + + # Extract the part before '[Source:...]', or the whole string if not found + meta_df["description"] = ( + meta_df["description"] + .str.replace(DESCRIPTION_PART_TO_REMOVE_REGEX, "", regex=True) + .str.replace(",", ";") + ) + + return mapping_dict, meta_df + + +def chunk_list(lst: list, chunksize: int) -> list: + """Splits a list into chunks of a given size. + + Args: + lst (list): The list to split. + chunksize (int): The size of each chunk. + + Returns: + list: A list of chunks, where each chunk is a list of len(chunksize). 
+ """ + return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] + + +def convert_ids( + ids: list[str], species: str, gprofiler_target_db: str +) -> tuple[dict, pd.DataFrame]: + mapping_dict = {} + gene_metadata_dfs = [] + + chunks = chunk_list(ids, chunksize=CHUNKSIZE) + for chunk_gene_ids in chunks: + # converting to Gene IDs for all IDs comprised in this chunk + gene_mapping, meta_df = convert_chunk_of_ids( + chunk_gene_ids, species, gprofiler_target_db + ) + mapping_dict.update(gene_mapping) + gene_metadata_dfs.append(meta_df) + + return mapping_dict, gene_metadata_dfs diff --git a/bin/impute_missing_values.py b/bin/impute_missing_values.py new file mode 100755 index 00000000..962e9728 --- /dev/null +++ b/bin/impute_missing_values.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import export_parquet, parse_count_table +from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE_SUFFIX = ".imputed.parquet" + +THRESHOLD_RATIO_ZEROS = 0.9 + +# KNN +N_NEIGHBORS = 10 + +# ITERATIVE +MAX_ITERATIONS = 10 +N_NEAREST_FEATURES = 100 + +IMPUTERS = ["knn", "iterative", "gene_mean"] + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Perform KNN imputation on count data") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument("--imputer", choices=IMPUTERS, required=True, dest="imputer") + return 
parser.parse_args() + + +def get_count_columns(df: pl.DataFrame): + return df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + + +def apply_imputer(df: pl.DataFrame, imputer): + # convert to numpy, impute, then convert back + count_matrix = df.select(get_count_columns(df)).to_numpy() + imputed_array = imputer.fit_transform(count_matrix) + return df.with_columns(pl.DataFrame(imputed_array, schema=get_count_columns(df))) + + +def apply_simle_imputer(df: pl.DataFrame): + imputer = SimpleImputer() + return apply_imputer(df, imputer) + + +def apply_knn_imputer(df: pl.DataFrame) -> pl.DataFrame: + imputer = KNNImputer(n_neighbors=N_NEIGHBORS, weights="distance") + return apply_imputer(df, imputer) + + +def apply_iterative_imputer(df: pl.DataFrame) -> pl.DataFrame: + imputer = IterativeImputer( + max_iter=MAX_ITERATIONS, + n_nearest_features=N_NEAREST_FEATURES, + random_state=0, + initial_strategy="mean", + min_value=0, + max_value=1, + imputation_order="random", + verbose=1, + ) + return apply_imputer(df, imputer) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + logger.info(f"Parsing {args.count_file.name}") + df = parse_count_table(args.count_file) + + # logger.info("Separating genes with high number of zeros") + # df, high_zero_genes_df = separate_genes_with_high_number_of_zeros(count_df) + + if args.imputer == "iterative": + logger.info("Applying iterative imputation") + df = apply_iterative_imputer(df) + elif args.imputer == "knn": + logger.info("Applying KNN imputation") + df = apply_knn_imputer(df) + elif args.imputer == "gene_mean": + logger.info("Applying simple imputation") + df = apply_simle_imputer(df) + + export_parquet(df, args.count_file, OUTFILE_SUFFIX) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git 
a/bin/make_cross_join.py b/bin/make_cross_join.py new file mode 100755 index 00000000..f28a597c --- /dev/null +++ b/bin/make_cross_join.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file1", + type=Path, + dest="count_file_1", + required=True, + help="Chunk count file 1", + ) + parser.add_argument( + "--file2", + type=Path, + dest="count_file_2", + required=True, + help="Chunk count file 2", + ) + parser.add_argument( + "--index1", + type=Path, + dest="count_file_1_index", + required=True, + help="Index of chunk count file 1", + ) + parser.add_argument( + "--index2", + type=Path, + dest="count_file_2_index", + required=True, + help="Index of chunk count file 2", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + lf = pl.scan_parquet(args.count_file_1, low_memory=low_memory) + lf_other = pl.scan_parquet(args.count_file_2, low_memory=low_memory) + + logger.info("Computing cross join data") + lf = lf.join( + lf_other, how="cross", suffix="_other" + 
) # Perform a cross join with itself + + df = lf.collect() + if len(df) == 0: + raise ValueError( + f"No output following treatment of files {str(args.count_file_1)} and {str(args.count_file_2)}" + ) + + outfile = f"cross_join.{args.count_file_1_index}.{args.count_file_2_index}.parquet" + df.write_parquet(outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/make_pairwise_gene_expression_ratio.py b/bin/make_pairwise_gene_expression_ratio.py new file mode 100755 index 00000000..0fdc5715 --- /dev/null +++ b/bin/make_pairwise_gene_expression_ratio.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file", + type=Path, + dest="cross_joined_file", + required=True, + help="File where each row contains counts for two genes", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + """Get all column names except the config.GENE_ID_COLNAME column. + + The config.GENE_ID_COLNAME column contains only gene IDs. 
+ """ + return [ + col + for col in lf.collect_schema().names() + if not col.startswith(config.GENE_ID_COLNAME) + ] + + +def compute_ratios(file: Path, low_memory: bool) -> pl.LazyFrame: + # getting ratios for each sample + cross_join_lf = pl.scan_parquet(file, low_memory=low_memory) + column_pairs = { + col: f"{col}_other" + for col in get_count_columns(cross_join_lf) + if not col.endswith("_other") + } + return cross_join_lf.select( + [pl.col(config.GENE_ID_COLNAME), pl.col(f"{config.GENE_ID_COLNAME}_other")] + + [ + (pl.col(col) / pl.col(other_col)).log(base=2).alias(f"{col}_log_ratio") + for col, other_col in column_pairs.items() + ] + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + ratios_lf = compute_ratios(args.cross_joined_file, low_memory) + + ratios_df = ratios_lf.collect() + + if len(ratios_df) == 0: + raise ValueError( + f"No output following treatment of file {str(args.cross_joined_file)}" + ) + + outfilename = args.cross_joined_file.name.replace("cross_join", "ratios") + ratios_df.write_parquet(outfilename) + + +if __name__ == "__main__": + main() diff --git a/bin/make_parquet_chunks.py b/bin/make_parquet_chunks.py new file mode 100755 index 00000000..59b8df3a --- /dev/null +++ b/bin/make_parquet_chunks.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from math import ceil +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# experimentally chosen +GENE_CHUNK_SIZE = 100 +ZERO_REPLACE_VALUE = 1e-8 + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--counts", + type=Path, + dest="count_file", + required=True, + help="File containing normalised counts for all genes and all samples", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_nb_rows(lf: pl.LazyFrame): + return lf.select(pl.len()).collect().item() + + +def parse_count_dataset(file: Path, low_memory: bool) -> pl.LazyFrame: + lf = pl.scan_parquet(file, low_memory=low_memory).fill_null(0).fill_nan(0) + count_columns = get_count_columns(lf) + cols = [pl.col(config.GENE_ID_COLNAME)] + [ + pl.col(column).replace({0: ZERO_REPLACE_VALUE}).cast(pl.Float64) + for column in count_columns + ] + return lf.select(cols) + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + """Get all column names except the config.GENE_ID_COLNAME column. + + The config.GENE_ID_COLNAME column contains only gene IDs. 
+ """ + return [ + col + for col in lf.collect_schema().names() + if not col.startswith(config.GENE_ID_COLNAME) + ] + + +def split_count_summary_in_chunks(lf: pl.LazyFrame): + lf = lf.with_row_index(name="index") + + nb_rows = get_nb_rows(lf) + logger.info(f"Number of rows (genes) in count file: {nb_rows}") + nb_chunks = ceil(nb_rows / GENE_CHUNK_SIZE) + logger.info(f"Number of chunks: {nb_chunks}") + + for i, start in enumerate(range(0, nb_rows, GENE_CHUNK_SIZE)): + partition = ( + lf.filter( + (pl.col("index") >= start) & (pl.col("index") < start + GENE_CHUNK_SIZE) + ) + .drop("index") + .collect() + ) + outfile = f"count_chunk.{i}.parquet" + partition.write_parquet(outfile) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + logger.info("Parsing count file") + lf = parse_count_dataset(args.count_file, low_memory) + + logger.info("Splitting count file into chunks") + split_count_summary_in_chunks(lf) + + +if __name__ == "__main__": + main() diff --git a/bin/map_ids_to_ensembl.py b/bin/map_ids_to_ensembl.py deleted file mode 100755 index cb0a30aa..00000000 --- a/bin/map_ids_to_ensembl.py +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/env python3 - -# Written by Olivier Coen. Released under the MIT license. 
- -import requests -import pandas as pd -from pathlib import Path -import argparse -import json - - -class NoIDFoundException(Exception): - pass - - -################################################################## -# CONSTANTS -################################################################## - -RENAMED_FILE_SUFFIX = "_renamed.csv" -MAPPING_FILE_SUFFIX = "_mapping.json" -CHUNKSIZE = 2000 # number of IDs to convert at a time - may create trouble if > 2000 - -GPROFILER_CONVERT_API_ENDPOINT = "https://biit.cs.ut.ee/gprofiler/api/convert/convert/" -TARGET_DATABASE = "ENSG" # Ensembl database - - -################################################################## -# FUNCTIONS -################################################################## - - -def parse_args(): - parser = argparse.ArgumentParser("Map IDs to Ensembl") - parser.add_argument("--count-file", type=Path, help="Input file containing counts") - parser.add_argument("--species", type=str, help="Species to convert IDs for") - return parser.parse_args() - - -def format_species_name(species: str): - """ - Format a species name into a format accepted by g:Profiler. - Example: Arabidopsis thaliana -> athaliana - - Parameters - ---------- - species : str - The species name. - - Returns - ------- - str - The formatted species name. - """ - splitted_species = species.lower().replace("_", " ").split(" ") - return splitted_species[0][0] + splitted_species[1] - - -def chunk_list(lst: list, chunksize: int): - """Splits a list into chunks of a given size. - - Args: - lst (list): The list to split. - chunksize (int): The size of each chunk. - - Returns: - list: A list of chunks, where each chunk is a list of len(chunksize). - """ - return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] - - -def request_conversion( - gene_ids: list, species: str, target_database: str -) -> list[dict]: - """ - Send a request to the g:Profiler API to convert a list of gene IDs. 
- - Parameters - ---------- - gene_ids : list - The list of gene IDs to convert. - species : str - The species to convert the IDs for. - target_database : str - The target database to convert to. - - Returns - ------- - list - The list of dicts corresponding to the converted IDs. - """ - response = requests.post( - url=GPROFILER_CONVERT_API_ENDPOINT, - json={"organism": species, "query": gene_ids, "target": TARGET_DATABASE}, - ) - response.raise_for_status() - return response.json()["result"] - - -def convert_ids(gene_ids: list, species: str): - """ - Wrapper function that converts a list of gene IDs to another namespace. - - Parameters - ---------- - species : str - The species to convert the IDs for. - gene_ids : list - The IDs to convert. - target_database : str - The target database to convert to. - - Returns - ------- - dict - A dictionary where the keys are the original IDs and the values are the converted IDs. - """ - - results = request_conversion(gene_ids, species, TARGET_DATABASE) - df = pd.DataFrame.from_records(results) - - if df.empty: - return {} - - # keeping only rows where 'converted' is not null and only the columns of interest - df = df.loc[df["converted"] != "None", ["incoming", "converted"]] - # changing index - df.set_index("incoming", inplace=True) - - return df.to_dict()["converted"] - - -################################################################## -# MAIN -################################################################## - - -def main(): - args = parse_args() - - count_file = args.count_file - species_name = format_species_name(args.species) - print( - f"Converting IDs for species {species_name} and count file {count_file.name}..." 
- ) - - df = pd.read_csv(count_file, header=0, index_col=0) - df.index = df.index.astype(str) - - gene_ids = df.index.tolist() - mapping_dict = {} - - chunks = chunk_list(gene_ids, chunksize=CHUNKSIZE) - for chunk_gene_names in chunks: - # converting to uniprot IDs / NCBI Gene IDs for all IDs comprised in this chunk - gene_mapping = convert_ids(chunk_gene_names, species_name) - mapping_dict.update(gene_mapping) - - if not mapping_dict: # if mapping dict is empty - raise NoIDFoundException( - f"No mapping found for gene names in count file {count_file.name} " - f"and for species {species_name}! " - f"Example of gene names found in the provided dataframe: {df.index[:5]}" - ) - - # filtering the DataFrame to keep only the rows where the index can be mapped - df = df.loc[df.index.isin(mapping_dict)] - - # renaming gene names to mapped ids using mapping dict - df.index = df.index.map(mapping_dict) - - # TODO: check is there is another way to avoid duplicate gene names - # sometimes different gene names have the same ensembl ID - # for now, we just get the mean of values, but this is not ideal - df = df.groupby(df.index).mean() - - # writing to output file - outfile = count_file.with_name(count_file.stem + RENAMED_FILE_SUFFIX) - df.to_csv(outfile, index=True, header=True) - - # writing mapping dict to file - mapping_file = count_file.with_name(count_file.stem + MAPPING_FILE_SUFFIX) - with open(mapping_file, "w") as f: - json.dump(mapping_dict, f) - - -if __name__ == "__main__": - main() diff --git a/bin/merge_counts.py b/bin/merge_counts.py new file mode 100755 index 00000000..0f9d426c --- /dev/null +++ b/bin/merge_counts.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import hashlib +import json +import logging +from operator import attrgetter +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +ALL_COUNTS_PARQUET_OUTFILENAME = "all_counts.parquet" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Merge count datasets") + parser.add_argument( + "--counts", type=str, dest="count_files", required=True, help="Count files" + ) + return parser.parse_args() + + +##################################################### +# COUNTS +##################################################### + + +def get_lazyframes(files: list[Path]) -> list[pl.LazyFrame]: + """Get a list of LazyFrames from a list of files.""" + return [pl.scan_parquet(file, low_memory=True) for file in files] + + +def get_columns(lf: pl.LazyFrame) -> list[str]: + return lf.collect_schema().names() + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + return [col for col in get_columns(lf) if col != config.GENE_ID_COLNAME] + + +def reproducible_hash(lf: pl.LazyFrame) -> str: + """ + Return a deterministic MD5 hash for a lazyframe. + + Steps: + 1. Convert the tuple (and any nested structures) to a canonical JSON string. + - `sort_keys=True` guarantees that dictionaries are ordered consistently. + - `separators=(',', ':')` removes unnecessary whitespace. + 2. Encode the string as UTF‑8 bytes. + 3. Feed the bytes to hashlib.md5 and return the hex digest. + + The result is a 64‑character hexadecimal string that will be identical + across Python runs, machines, and even different Python versions + (provided the data types are JSON‑compatible). 
+ """ + tpl = tuple(get_columns(lf)) + # Canonical JSON representation + canonical_str = json.dumps(tpl, sort_keys=True, separators=(",", ":")) + # Encode to bytes + data_bytes = canonical_str.encode("utf-8") + # Compute MD5 + hash_obj = hashlib.md5(data_bytes) + return hash_obj.hexdigest() + + +def scan_counts(files: list[Path]) -> list[pl.LazyFrame]: + """ + Get all count data from a list of files. + """ + logger.info("Parsing counts") + # sorting them by file name to ensure consistent order between runs + files.sort(key=attrgetter("name")) + + lfs = get_lazyframes(files) + + # sorting dataframes by a hash on column names + # this is crucial for consistent output of the script + # in case multiple files have the same name + return sorted(lfs, key=lambda lf: reproducible_hash(lf)) + + +def collect_all_gene_ids(lfs: list[pl.LazyFrame]) -> pl.DataFrame: + """ + Collect all gene IDs from a list of lazyframes. + """ + logger.info("Getting the full list of gene IDs") + gene_id_set = set() + for lf in lfs: + lf_gene_ids = lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list() + gene_id_set.update(lf_gene_ids) + return pl.DataFrame({config.GENE_ID_COLNAME: sorted(list(gene_id_set))}) + + +def make_tmp_sorted_dataframes( + lfs: list[pl.LazyFrame], gene_id_df: pl.DataFrame +) -> list[Path]: + """ """ + tmp_files = [] + for i, lf in enumerate(lfs): + # perform left join from gene ids so that all dataframes can be compared row-wise + # removing the gene id column for now + df = gene_id_df.join( + lf.collect(), on=config.GENE_ID_COLNAME, how="left" + ).select(pl.exclude(config.GENE_ID_COLNAME)) + outfile = Path(f"tmp.{i}.parquet") + df.write_parquet(outfile) + tmp_files.append(outfile) + return tmp_files + + +def formating_counts(lf: pl.LazyFrame): + """ + The config.GENE_ID_COLNAME column is cast + to String, and all other columns are cast to Float64. 
+ """ + + # casting count columns to Float64 + # casting gene id column to Stringcount_files + # casting nans to nulls + logger.info("Cleaning merged lazyframe") + return lf.select( + [pl.col(config.GENE_ID_COLNAME).cast(pl.String)] + + [pl.col(column).cast(pl.Float64) for column in get_count_columns(lf)] + ).fill_nan(None) + + +##################################################### +# EXPORT +##################################################### + + +def export_data(lf: pl.LazyFrame): + """Export gene expression data.""" + logger.info(f"Exporting normalised counts to: {ALL_COUNTS_PARQUET_OUTFILENAME}") + lf.sink_parquet(ALL_COUNTS_PARQUET_OUTFILENAME) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + # parsing count files + count_files = [Path(file) for file in args.count_files.split(" ")] + logger.info(f"Merging {len(count_files)} count files") + + lfs = scan_counts(count_files) + + # collecting all gene ids from all lazyframes into a dataframe with one column + gene_id_df = collect_all_gene_ids(lfs) + + # performing a left join between the sorted list of gene if and each collected lazyframe separately + # writing this sorted dataframe in a tmp file + tmp_files = make_tmp_sorted_dataframes(lfs, gene_id_df) + + # scanning the newly created tmp files + lfs = scan_counts(tmp_files) + + # these files are ready to be merged directly through horizontal concatenation + # setting strict=True requires all DataFrames to be the same height, raising an error if not. 
+ merged_lf = pl.concat([gene_id_df.lazy()] + lfs, how="horizontal", strict=True) + + # performing some cleaning / formating operations + merged_lf = formating_counts(merged_lf) + + # exporting merged data in streaming mode + export_data(merged_lf) + + # cleaning up tmp files + for tmp_file in tmp_files: + tmp_file.unlink() + + +if __name__ == "__main__": + main() diff --git a/bin/natural_language_utils.py b/bin/natural_language_utils.py new file mode 100755 index 00000000..79f8463c --- /dev/null +++ b/bin/natural_language_utils.py @@ -0,0 +1,139 @@ +import nltk +from nltk.corpus import wordnet + +nltk.download("punkt_tab") +nltk.download("averaged_perceptron_tagger_eng") +nltk.download("wordnet") + +lemmatizer = nltk.WordNetLemmatizer() +stemmer = nltk.PorterStemmer() + + +def get_wordnet_pos(token: str) -> str: + tag = nltk.pos_tag([token])[0][1][0].upper() + tag_dict = { + "J": wordnet.ADJ, + "N": wordnet.NOUN, + "V": wordnet.VERB, + "R": wordnet.ADV, + } + return tag_dict.get(tag, wordnet.NOUN) # Default to NOUN if not found + + +def get_stemmed_tokens(sentence: str) -> list[str]: + """ + Tokenize a sentence into its constituent words, and then stem each word + + Parameters + ---------- + sentence : str + The sentence to be tokenized and stemmed + + Returns + ------- + tokens : List[str] + The list of stemmed tokens + """ + + tokens = nltk.word_tokenize(sentence) + return [stemmer.stem(token) for token in tokens] + + +def get_lemmed_tokens(sentence: str) -> list[str]: + """ + Tokenize a sentence into its constituent words, and then lemmatize each word + + Parameters + ---------- + sentence : str + The sentence to be tokenized and lemmatized + + Returns + ------- + tokens : List[str] + The list of lemmatized tokens + """ + tokens = nltk.word_tokenize(sentence) + return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens] + + +def get_synonyms(word) -> set[str]: + """ + Get all synonyms of a word from the wordnet database. 
+ + Parameters + ---------- + word : str + The word for which to get synonyms + + Returns + ------- + synonyms : set + A set of all synonyms of the word + """ + synonyms = [] + for syn in wordnet.synsets(word): + for lemma in syn.lemmas(): + synonyms.append(lemma.name()) # Get the name of each lemma (synonym) + return set(synonyms) # Return as a set to avoid duplicates + + +def get_all_candidate_target_words(sentence: str) -> list[str]: + """ + Get all candidate target words from a sentence by stemming and lemmatizing the + tokens and getting synonyms from the wordnet database. + + Parameters + ---------- + sentence : str + The sentence from which to get candidate target words + + Returns + ------- + candidates : list + A list of all candidate target words + """ + candidates = [] + lemmatized_tokens = get_stemmed_tokens(sentence) + stemmed_tokens = get_stemmed_tokens(sentence) + tokens = list(set(lemmatized_tokens + stemmed_tokens)) + for token in tokens: + candidates += get_synonyms(token) + return candidates + + +def word_is_in_sentence(word: str, sentence: str) -> bool: + """ + Check if a word (or a stemmed version of it) is in a sentence, or if it is a + subword of a stemmed version of any word in the sentence. 
+ + Parameters + ---------- + word : str + The word to be searched for + sentence : str + The sentence in which to search for the word + + Returns + ------- + bool + True if the word is found in the sentence, False otherwise + """ + for stemmed_word in [word] + get_stemmed_tokens(word): + # testing if stemmed word is in sentence as it is + if stemmed_word in sentence: + return True + # or testing if stemmed word is a subword of a stemmed word from the sentence + for target_word in get_all_candidate_target_words(sentence): + if stemmed_word in target_word: + return True + return False + + +def keywords_in_fields(fields: list[str], keywords: list[str]) -> list[str]: + return [ + keyword + for keyword in keywords + for field in fields + if word_is_in_sentence(keyword, field) + ] diff --git a/bin/normalise_microarray.R b/bin/normalise_microarray.R new file mode 100755 index 00000000..f9343ebf --- /dev/null +++ b/bin/normalise_microarray.R @@ -0,0 +1,129 @@ +#!/usr/bin/env Rscript + +# Written by Olivier Coen. Released under the MIT license. 
+ +suppressPackageStartupMessages(library("affy")) +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("AnnotationDbi")) +suppressPackageStartupMessages(library("dplyr")) + +# Load library +library(affy) +library(optparse) +library(AnnotationDbi) +library(dplyr) +library(tibble) + +options(error = traceback) + +# we need to install the affy package manually while disabling threading +# when installed through conda, we get: ERROR; return code from pthread_create() is 22 +if (!requireNamespace("affy", quietly = TRUE)) { + BiocManager::install("affy", configure.args="--disable-threading", force = TRUE, quiet = TRUE) +} + + +##################################################### +##################################################### +# ARG PARSER +##################################################### +##################################################### + +get_args <- function() { + option_list <- list( + make_option("--input", help = "Folder containing CEL files"), + make_option("--target-gene-id-db", dest = "target_gene_id_db", help = "Target database for gene IDs (ENSEMBL or ENTREZID)") + ) + + args <- parse_args(OptionParser( + option_list = option_list, + description = "Normalize microarray data using RMA" + )) + return(args) +} + +get_probe_id_mapping <- function(data, annot_db, target_gene_id_db, stringent) { + + probe_ids <- rownames(data) + annotations <- AnnotationDbi::select( + annot_db, + keys = probe_ids, + columns = c(target_gene_id_db), + keytype = "PROBEID" + ) + + if (stringent) { + annotations <- annotations %>% + group_by(PROBEID) %>% + filter(n_distinct(.data[[target_gene_id_db]], na.rm = TRUE) == 1) %>% + ungroup() + } + + return(annotations) +} + +replace_probe_ids_by_target_ids <- function(data, annotations, target_gene_id_db) { + data <- as.data.frame(data) + data$PROBEID <- rownames(data) + + data <- merge(annotations, data, by = "PROBEID", all.x = TRUE) + + # computing mean of probe values for each gene 
+ data <- data %>% + group_by(.data[[target_gene_id_db]]) %>% + summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE))) %>% + ungroup() + + data <- tibble::column_to_rownames(data, var = target_gene_id_db) + return(data) +} + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +main <- function() { + + args <- get_args() + + # Read CEL files from a directory + message("Reading CEL files from", args$input) + data <- ReadAffy(celfile.path = args$input) + + message("Installing annotation database") + db_name <- paste0(annotation(data), ".db") + if (!requireNamespace(db_name, quietly = TRUE)) { + BiocManager::install(db_name, quiet = TRUE) + } + library(db_name, character.only = TRUE) + + # Normalize using RMA (most common method) + eset <- rma(data) + # Extract normalized expression values + message("Extracting normalized expression values") + normalised_data <- exprs(eset) + + annotations <- get_probe_id_mapping( + normalised_data, + annot_db = get(db_name), # Get the database object using get() + target_gene_id_db = args$target_gene_id_db, + stringent = TRUE + ) + + normalised_data_df <- replace_probe_ids_by_target_ids(normalised_data, annotations, args$target_gene_id_db) + + # cleaning colnames + colnames(normalised_data_df) <- sub("\\..*", "", colnames(normalised_data_df)) + colnames(normalised_data_df) <- sub("-", "_", colnames(normalised_data_df)) + + # Save results + message("Saving results to normalised_expression.csv") + write.csv(normalised_data_df, "normalised_expression.csv") + +} + +main() diff --git a/bin/normfinder.py b/bin/normfinder.py new file mode 100755 index 00000000..7cd49282 --- /dev/null +++ b/bin/normfinder.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import sys +from dataclasses import dataclass, field +from pathlib import Path +from statistics import mean + +import config +import numpy as np +import polars as pl +from common import write_float_csv +from numba import njit, prange +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +STABILITY_OUTFILENAME = "stability_values.normfinder.csv" + + +############################################################################ +# POLARS EXTENSIONS +############################################################################ + + +@pl.api.register_expr_namespace("row") +class StatsExtension: + def __init__(self, expr: pl.Expr): + self._expr = expr + + def not_null_values(self): + return self._expr.list.eval(pl.element().drop_nulls().drop_nans()).list + + def mean(self) -> pl.Expr: + """Mean over non nulls values in row""" + return self.not_null_values().mean() + + def sum(self) -> pl.Expr: + """Median over non nulls values in row""" + return self.not_null_values().sum() + + def min(self) -> pl.Expr: + """Median over non nulls values in row""" + return self.not_null_values().min() + + +############################################################################ +# NUMBA-ACCELERATED FUNCTIONS +############################################################################ + + +@njit(parallel=True) +def compute_minvars(z: np.ndarray, target_idx: np.ndarray) -> np.ndarray: + """ + z: (ngenes, nsamples) array + target_idx: 1D array of indices (int64) for which to compute minvar + returns: 1D array of length len(target_idx) + """ + ngenes, nsamples = z.shape + + # should not happen as it is controlled before, but just in case + if nsamples < 2: + raise ValueError("Number of samples must be at least 2") + + minvars = np.empty(len(target_idx), dtype=np.float64) + for k in prange(len(target_idx)): + i = target_idx[k] + # checking if counts for this gene are all nans + nb_valid_counts = 
(~np.isnan(z[i, :])).sum() + if nb_valid_counts < 1: + minvars[k] = np.nan + continue # skip this gene + # computing variances of pairwise differences + minv = 1e18 + for j in prange(ngenes): + if i == j: + continue + diffs = z[i, :] - z[j, :] + mean = np.sum(diffs) / nsamples # scalar + var = np.sum((diffs - mean) ** 2) / (nsamples - 1) # scalar + if np.isnan(var): + continue # skip + if var < minv: + minv = var + minvars[k] = minv / 4.0 if minv < 1e18 else np.inf + return minvars + + +##################################################### +# NORMFINDER CLASS +##################################################### + + +@dataclass +class NormFinder: + count_lf: pl.LazyFrame + design_df: pl.DataFrame + + genes: list[str] = field(init=False) + + group_to_samples_dict: dict[str, list[str]] = field(init=False) + + n_groups: int = field(init=False) + n_genes: int = field(init=False) + + def __post_init__(self): + # format_design + self.design_df = self.design_df.with_columns( + pl.concat_str([pl.col("batch"), pl.col("condition")], separator="_").alias( + "group" + ) + ).select("sample", "group") + + # make dict associating a group to the list of its samples + group_to_sample_df = self.design_df.group_by("group", maintain_order=True).agg( + "sample" + ) # maintain order is better for repeatability and testing + + self.group_to_samples_dict = { + d["group"]: d["sample"] for d in group_to_sample_df.to_dicts() + } + + groups = list(self.group_to_samples_dict.keys()) + self.n_groups = len(groups) + + self.genes = ( + self.count_lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list() + ) + self.n_genes = len(self.genes) + + if self.n_genes <= 2: + logger.error("Too few genes") + sys.exit(100) + + @staticmethod + def get_overall_mean_for_group(df_with_means_over_samples: pl.DataFrame) -> float: + return df_with_means_over_samples.mean().item() + + @staticmethod + def get_means_over_samples(df: pl.DataFrame) -> pl.DataFrame: + return df.with_columns( + 
mean_over_samples_for_gene=pl.concat_list(pl.all()).row.mean() + ).select("mean_over_samples_for_gene") + + def correct_negative_values( + self, intra_var_df: pl.DataFrame, group_count_df: pl.DataFrame + ) -> pl.DataFrame: + genes_with_negative_values = intra_var_df.select( + col for col in self.genes if (intra_var_df[col] < 0).all() + ).columns # intra_var_df has only one row but it is a dataframe + + # getting indexes of genes for which we must compute minvar + indexes_of_genes_with_negative_values = np.array( + [ + i + for i, gene in enumerate(self.genes) + if gene in genes_with_negative_values + ], + dtype=np.int64, + ) + + minvars = compute_minvars( + group_count_df.to_numpy(), indexes_of_genes_with_negative_values + ) + + # associating back minvars to their respective gene + minvar_dict = { + gene: minvars[i] for i, gene in enumerate(genes_with_negative_values) + } + return intra_var_df.with_columns( + [pl.lit(val).alias(col) for col, val in minvar_dict.items()] + ) + + def get_unbiased_intragroup_variance_for_group( + self, + group_count_df: pl.DataFrame, + means_over_samples_df: pl.DataFrame, + group_overall_mean: float, + samples: list[str], + ): + # TODO: see if it's correct + # if only one sample in the group, there's no variance + if len(samples) == 1: + data = {gene: [0] for gene in self.genes} + return pl.DataFrame(data) + + # lf is a lazyframe with a column being the gene ids (gene_id) + # and other columns being the samples + # the current chunk corresponds to only one group + # means_over_samples_df is a single column dataframe containing the means across each row (ie for each gene across samples) + ng = len(samples) + + means_over_samples = means_over_samples_df.to_series().rename( + "mean_over_samples_for_gene" + ) + + mean_over_genes = ( + group_count_df.mean() + .transpose() + .to_series() + .rename("mean_over_genes_for_sample") + ) + + sample_variance_df = ( + group_count_df.hstack( + [means_over_samples] + ) # adding column containing means 
over all samples in this group (for each gene) + .select( + [ + (pl.col(c) - pl.col("mean_over_samples_for_gene")).alias( + c + ) # y_igj - mean(y_ig*) + for c in samples + ] + ) + .transpose( + include_header=True, column_names=self.genes + ) # columns are now genes + .hstack( + [mean_over_genes] + ) # adding column containing means over all genes (for each sample) + .select( + [ + ( + ( + pl.col(c) + - pl.col("mean_over_genes_for_sample") + + group_overall_mean + ) + ** 2 + ).alias( + c + ) # r_igj ^2 = (y_igj - mean(y_ig*) -mean(y_*gj) + mean(y_*g*) ) ^ 2 + for c in self.genes + ] + ) + .transpose(include_header=True, column_names=samples) + .with_columns( + sample_variance=pl.concat_list(samples).row.sum() + / ( + (ng - 1) * (1 - 2 / self.n_genes) + ) # sum over j (samples) of r_igj ^2 terms + ) + .select("sample_variance") + .transpose() + .rename({f"column_{i}": gene for i, gene in enumerate(self.genes)}) + ) + + # sum of all sample variances for all genes + sample_variance_sum_over_genes = sample_variance_df.select( + pl.sum_horizontal(pl.all()) + ).item() # sum of all s_ij² over all genes + + intra_var_df = sample_variance_df.select( + [ + ( + pl.col(c) + - sample_variance_sum_over_genes + / (self.n_genes * (self.n_genes - 1)) + ).alias(c) + for c in self.genes + ] + ) + # if some values are negative, we need a special process + corrected_intra_var_df = self.correct_negative_values( + intra_var_df, group_count_df + ) + + return corrected_intra_var_df + + def get_unbiased_intragroup_variances(self): + unbiased_intragroup_variance_dfs = [] + means_over_samples_dfs = [] + group_overall_means = [] + + for group, samples in tqdm(self.group_to_samples_dict.items()): + # sub dataframe corresponding to this group + chunk_df = self.count_lf.select(samples).collect() + # computing means over samples for each gene + means_over_samples_df = self.get_means_over_samples(chunk_df) + # getting overall expression average in the group for all genes + group_overall_mean = 
self.get_overall_mean_for_group(means_over_samples_df) + + group_unbiased_intragroup_variance_df = ( + self.get_unbiased_intragroup_variance_for_group( + chunk_df, means_over_samples_df, group_overall_mean, samples + ) + ) + + # storing intragroup values for each gene in this group + unbiased_intragroup_variance_dfs.append( + group_unbiased_intragroup_variance_df + ) + # storing means over samples in this group for each gene + means_over_samples_df = means_over_samples_df.rename( + {"mean_over_samples_for_gene": group} + ) + means_over_samples_dfs.append(means_over_samples_df) + # storing overall mean of expression in this group, for all genes and samples + group_overall_means.append(group_overall_mean) + + # cast all values to float (to avoid issues when concat) + unbiased_intragroup_variance_dfs = [ + df.select([pl.col(col).cast(pl.Float64) for col in df.columns]) + for df in unbiased_intragroup_variance_dfs + ] + + # removing None values in group_overall_means + # which would originate from group chunk dataframes that are full of null values + group_overall_means = [mean for mean in group_overall_means if mean is not None] + + # before returning: + # concatenate together all intragroup variance data to have a single df for all groups + # stack all means over samples horizontally (becomes a gene * group df ) + # get the mean of group_overall_means to get the overall mean expression value in the count dataframe + return ( + pl.concat(unbiased_intragroup_variance_dfs), + pl.concat(means_over_samples_dfs, how="horizontal"), + mean(group_overall_means), + ) + + def adjust_for_nb_of_samples_in_groups( + self, unbiased_intragroup_variance_df: pl.DataFrame + ): + n_samples_list = [ + len(samples) for samples in self.group_to_samples_dict.values() + ] + return unbiased_intragroup_variance_df.with_columns( + n_samples=pl.Series(n_samples_list) + ).select([(pl.col(c) / pl.col("n_samples")).alias(c) for c in self.genes]) + + def get_unbiased_intergroup_variance( + self, 
gene_means_in_groups_df: pl.DataFrame, dataset_overall_mean: float + ): + mean_over_genes = ( + gene_means_in_groups_df.mean() + .transpose() + .to_series() + .rename("mean_over_genes_for_group") + ) + + return ( + gene_means_in_groups_df.with_columns( + mean_over_groups_for_gene=pl.concat_list(pl.all()).row.mean() + ) + .select( + [ + (pl.col(c) - pl.col("mean_over_groups_for_gene")).alias(c) + for c in gene_means_in_groups_df.columns + ] + ) + .transpose(column_names=self.genes) + .hstack([mean_over_genes]) + .select( + [ + ( + pl.col(c) + - pl.col("mean_over_genes_for_group") + + dataset_overall_mean + ).alias(c) + for c in self.genes + ] + ) + .select( + [(pl.col(c) ** 2).alias(c) for c in self.genes] + ) # square to get variance + ) + + def compute_gamma_factor(self, diff_df: pl.DataFrame, vardiff_df: pl.DataFrame): + logger.info("Computing gamma factor") + first_term = ( + diff_df.with_columns( + sum_of_squares=pl.concat_list(pl.all()).row.sum() # sum over columns + ) + .select("sum_of_squares") + .sum() # sum over rows + .select( + ( + pl.col("sum_of_squares") + / ((self.n_groups - 1) * (self.n_genes - 1)) + ).alias("normalised_sum_of_squares") + ) + .item() + ) + + second_term = ( + vardiff_df.with_columns( + sum=pl.concat_list(pl.all()).row.sum() # sum over columns + ) + .select("sum") + .sum() # sum over rows + .select( + (pl.col("sum") / (self.n_groups * self.n_genes)).alias("normalised_sum") + ) + .item() + ) + + return max(first_term - second_term, 0) # set to 0 if negative + + @staticmethod + def apply_gamma_factor( + gamma: float, diff_df: pl.DataFrame, vardiff_df: pl.DataFrame + ): + difnew = diff_df * gamma / (gamma + vardiff_df) + varnew = vardiff_df + gamma * vardiff_df / (gamma + vardiff_df) + return difnew, varnew + + def apply_shrinkage( + self, intergroup_variance_df: pl.DataFrame, group_mean_variance_df: pl.DataFrame + ): + gamma = self.compute_gamma_factor( + intergroup_variance_df, group_mean_variance_df + ) + return 
self.apply_gamma_factor( + gamma, intergroup_variance_df, group_mean_variance_df + ) + + def get_stability_values( + self, shrunk_intervar_df: pl.DataFrame, shrunk_gr_mean_var_df: pl.DataFrame + ): + return ( + ( + shrunk_intervar_df.select([pl.col(c).abs() for c in self.genes]) + + shrunk_gr_mean_var_df.select([pl.col(c).sqrt() for c in self.genes]) + ) + .mean() + .transpose( + include_header=True, + header_name=config.GENE_ID_COLNAME, + column_names=[config.NORMFINDER_STABILITY_VALUE_COLNAME], + ) + ) + + def compute_stability_scoring(self): + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # UNBIASED INTRAGROUP VARIANCE + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Computing intragroup variances") + intragroup_variance_df, gene_means_in_groups_df, dataset_overall_mean = ( + self.get_unbiased_intragroup_variances() + ) + + logger.info("Adjusting variances by group size") + group_mean_variance_df = self.adjust_for_nb_of_samples_in_groups( + intragroup_variance_df + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # INTERGROUP VARIANCE + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Computing intergroup variances") + intergroup_variance_df = self.get_unbiased_intergroup_variance( + gene_means_in_groups_df, dataset_overall_mean + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # STABILITY VALUES + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Shrinking intragroup and intergroup variances using gamma factor") + shrunk_intervar_df, shrunk_gr_mean_var_df = self.apply_shrinkage( + intergroup_variance_df, group_mean_variance_df + ) + + logger.info("Computing stability values") + return self.get_stability_values(shrunk_intervar_df, shrunk_gr_mean_var_df) + + +##################################################### +# FUNCTIONS +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + 
description="Quantile normalise count data for each sample in the dataset" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--design", type=Path, dest="design_file", required=True, help="Design file" + ) + return parser.parse_args() + + +def export_stability(stabilities: pl.DataFrame): + """Export stability values to CSV file.""" + logger.info(f"Exporting stability values to: {STABILITY_OUTFILENAME}") + write_float_csv(stabilities, STABILITY_OUTFILENAME) + + +def main(): + args = parse_args() + + logger.info(f"Getting counts from {args.count_file}") + count_lf = pl.scan_parquet(args.count_file) + + logger.info(f"Getting design from {args.design_file}") + design_df = pl.read_csv(args.design_file) + # filter design df to keep only samples that are present in the count dataframe + design_df = design_df.filter( + pl.col("sample").is_in(count_lf.collect_schema().names()) + ) + + nfd = NormFinder(count_lf, design_df) + stabilities = nfd.compute_stability_scoring() + + logger.info(f"Stability values:\n{stabilities}") + export_stability(stabilities) + + +if __name__ == "__main__": + main() diff --git a/bin/quantile_normalise.py b/bin/quantile_normalise.py new file mode 100755 index 00000000..5b26f4d9 --- /dev/null +++ b/bin/quantile_normalise.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import export_parquet, parse_count_table +from sklearn.preprocessing import quantile_transform + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE_SUFFIX = ".quant_norm.parquet" + +N_QUANTILES = 1000 + +ALLOWED_TARGET_DISTRIBUTIONS = ["normal", "uniform"] + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Quantile normalise count data for each sample in the dataset" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--target-distrib", + type=str, + dest="target_distribution", + required=True, + choices=ALLOWED_TARGET_DISTRIBUTIONS, + help="Target distribution to map counts to", + ) + return parser.parse_args() + + +def quantile_normalise(df: pl.DataFrame, target_distribution: str): + """ + Quantile normalize a dataframe; column by column, based on a target distribution. 
+ """ + kwargs = dict( + n_quantiles=N_QUANTILES, output_distribution=target_distribution, subsample=None + ) + return df.with_columns( + pl.exclude(config.GENE_ID_COLNAME).map_batches( + lambda x: quantile_transform(x.to_frame(), **kwargs).flatten(), + return_dtype=pl.Float64, + ) + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + logger.info(f"Parsing {args.count_file.name}") + count_df = parse_count_table(args.count_file) + + logger.info(f"Quantile normalising {args.count_file.name}") + quantile_normalized_counts = quantile_normalise(count_df, args.target_distribution) + + export_parquet(quantile_normalized_counts, args.count_file, OUTFILE_SUFFIX) + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index 47047a68..12e88c25 100644 --- a/conf/base.config +++ b/conf/base.config @@ -8,53 +8,85 @@ ---------------------------------------------------------------------------------------- */ +executor { + cpus = 8 + memory = 24.GB +} + process { - resourceLimits = [ cpus: 24, memory: 768.GB, time: 72.h ] + resourceLimits = [ + cpus: 16, + memory: '24.GB', + time: '4.h' + ] + + cpus = { 1 * task.attempt } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } + + errorStrategy = { + if (task.exitStatus == 100) { // managed errors that should not be retried but ignored at once + 'ignore' + } else if (task.exitStatus == 101) { // connection errors that should be retried + 'retry' + } else if (task.exitStatus in ((130..145) + 104 + 175)) { // OOM & related errors; should be retried as long as memory does not fit + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3 + 
sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else { // after 3 retries, ignore the error + 'terminate' + } + } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 - maxErrors = '-1' + maxRetries = 10 + maxErrors = '-1' // Process-specific resource requirements - // NOTE - Please try and re-use the labels below as much as possible. + // NOTE - Please try and reuse the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 2.GB * task.attempt } + time = { 1.h * task.attempt } } withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 2 } + memory = { 4.GB + 2.GB * task.attempt } + time = { 2.h * task.attempt } } withLabel:process_medium { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { 4 } + memory = { 6.GB + 2.GB * task.attempt } + time = { 4.h * task.attempt } } withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 72.GB * task.attempt, 'memory' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } - } - withLabel:process_long { - time = { check_max( 20.h * task.attempt, 'time' ) } - } - withLabel:process_high_memory { - memory = { check_max( 200.GB * task.attempt, 
'memory' ) } - } - withLabel:error_ignore { - errorStrategy = 'ignore' + cpus = { 4 } + memory = { 8.GB + 4.GB * task.attempt } + time = { 8.h * task.attempt } } - withLabel:error_retry { - errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } - maxRetries = 3 + withLabel:can_fail { + errorStrategy = { + if (task.exitStatus == 100) { // managed errors that should not be retried but ignored at once + 'ignore' + } else if (task.exitStatus == 101) { // connection errors that should be retried + 'retry' + } else if (task.exitStatus in ((130..145) + 104 + 175)) { // OOM & related errors; should be retried as long as memory does not fit + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3 + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else { // after 3 retries, ignore the error + 'ignore' + } + } } } diff --git a/conf/modules.config b/conf/modules.config index e27fd282..70604b85 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,12 +10,19 @@ ---------------------------------------------------------------------------------------- */ -process { +/* +publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } +] +*/ - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] -} +includeConfig 'modules/public_data.config' +includeConfig 'modules/id_mapping.config' +includeConfig 'modules/gene_length.config' +includeConfig 'modules/normalisation.config' +includeConfig 'modules/merging.config' +includeConfig 'modules/statistics.config' +includeConfig 'modules/reporting.config' diff --git a/conf/modules/gene_length.config b/conf/modules/gene_length.config new file mode 100644 index 00000000..4fa9b8be --- /dev/null +++ b/conf/modules/gene_length.config @@ -0,0 +1,17 @@ +process { + + withName: DOWNLOAD_ENSEMBL_ANNOTATION { + publishDir = [ + path: { "${params.outdir}/gene_length" }, + mode: params.publish_dir_mode + ] + } + + withName: COMPUTE_GENE_TRANSCRIPT_LENGTHS { + publishDir = [ + path: { "${params.outdir}/gene_length" }, + mode: params.publish_dir_mode + ] + } + +} diff --git a/conf/modules/id_mapping.config b/conf/modules/id_mapping.config new file mode 100644 index 00000000..4e048a37 --- /dev/null +++ b/conf/modules/id_mapping.config @@ -0,0 +1,27 @@ +process { + + withName: COLLECT_GENE_IDS { + publishDir = [ + path: { "${params.outdir}/idmapping/collected_gene_ids" }, + mode: params.publish_dir_mode + ] + } + + withName: GPROFILER_IDMAPPING { + publishDir = [ + path: { "${params.outdir}/idmapping/gprofiler" }, + mode: params.publish_dir_mode + ] + } + + withName: FILTER_AND_RENAME_GENES { + publishDir = [ + path: { "${params.outdir}/idmapping/renamed" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? 
null : filename + } + ] + } + +} diff --git a/conf/modules/merging.config b/conf/modules/merging.config new file mode 100644 index 00000000..f9c1e46d --- /dev/null +++ b/conf/modules/merging.config @@ -0,0 +1,8 @@ +process { + + withName: 'NFCORE_STABLEEXPRESSION:STABLEEXPRESSION:MERGE_DATA:PLATFORM' { + tag = { "${meta.platform}" } + maxForks = 1 + } + +} diff --git a/conf/modules/normalisation.config b/conf/modules/normalisation.config new file mode 100644 index 00000000..94aa5dc7 --- /dev/null +++ b/conf/modules/normalisation.config @@ -0,0 +1,30 @@ +process { + + withName: COMPUTE_CPM { + publishDir = [ + path: { "${params.outdir}/normalised/cpm/${meta.dataset}/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? null : filename + } + ] + } + + withName: COMPUTE_TPM { + publishDir = [ + path: { "${params.outdir}/normalised/tpm/${meta.dataset}/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? null : filename + } + ] + } + + withName: QUANTILE_NORMALISATION { + publishDir = [ + path: { "${params.outdir}/normalised/quantile_normalised/${meta.dataset}/" }, + mode: params.publish_dir_mode + ] + } + +} diff --git a/conf/modules/public_data.config b/conf/modules/public_data.config new file mode 100644 index 00000000..9198c9e2 --- /dev/null +++ b/conf/modules/public_data.config @@ -0,0 +1,39 @@ +process { + + withName: EXPRESSIONATLAS_GETACCESSIONS { + publishDir = [ + path: { "${params.outdir}/public_data/expression_atlas/accessions/" }, + mode: params.publish_dir_mode + ] + } + + withName: EXPRESSIONATLAS_GETDATA { + + publishDir = [ + path: { "${params.outdir}/public_data/expression_atlas/datasets/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? 
null : filename + } + ] + + } + + withName: GEO_GETACCESSIONS { + publishDir = [ + path: { "${params.outdir}/public_data/geo/accessions/" }, + mode: params.publish_dir_mode + ] + } + + withName: GEO_GETDATA { + publishDir = [ + path: { "${params.outdir}/public_data/geo/datasets/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? null : filename + } + ] + } + +} diff --git a/conf/modules/reporting.config b/conf/modules/reporting.config new file mode 100644 index 00000000..afb22c3f --- /dev/null +++ b/conf/modules/reporting.config @@ -0,0 +1,30 @@ +process { + + withName: AGGREGATE_RESULTS { + publishDir = [ + path: { "${params.outdir}/aggregated" }, + mode: params.publish_dir_mode + ] + } + + withName: 'MULTIQC' { + cpus = { 4 } + memory = { 8.GB * task.attempt } + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy' + ] + } + + withName: 'DASH_APP' { + publishDir = [ + path: { "${params.outdir}/dash_app/" }, + mode: 'copy', + saveAs: { + filename -> ['versions.yml', 'file_system_backend'].contains(filename) ? null : filename + } + ] + } + +} diff --git a/conf/modules/statistics.config b/conf/modules/statistics.config new file mode 100644 index 00000000..8527990e --- /dev/null +++ b/conf/modules/statistics.config @@ -0,0 +1,8 @@ +process { + + withName: 'NFCORE_STABLEEXPRESSION:STABLEEXPRESSION:GENE_STATISTICS:PLATFORM' { + tag = { "${meta.platform}" } + maxForks = 1 + } + +} diff --git a/conf/test.config b/conf/test.config index 6c21d04e..fdf68236 100644 --- a/conf/test.config +++ b/conf/test.config @@ -3,19 +3,19 @@ Nextflow config file for running minimal tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a fast and simple pipeline test. 
+ It tests the different ways to use the pipeline, with small data Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir + nextflow run nf-core/stableexpression -profile test_dataset, --outdir ---------------------------------------------------------------------------------------- */ params { - config_profile_name = 'Test profile' + config_profile_name = 'Test dataset profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - species = 'solanum tuberosum' - fetch_eatlas_accessions = false - eatlas_accessions = "E-MTAB-552" + species = 'beta vulgaris' + outdir = "results/test" } diff --git a/conf/test_dataset_eatlas.config b/conf/test_dataset_eatlas.config new file mode 100644 index 00000000..c46350ad --- /dev/null +++ b/conf/test_dataset_eatlas.config @@ -0,0 +1,25 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. + This tests the capacity of the pipeline to process a full size dataset. 
+ + Use as follows: + nextflow run nf-core/stableexpression -profile test_full, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data + species = 'mus_musculus' + accessions = "E-MTAB-2262" + skip_fetch_eatlas_accessions = true + fetch_geo_accessions = false + datasets = 'https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_big.yaml' + outdir = "results/test_dataset_eatlas" +} diff --git a/conf/test_full.config b/conf/test_full.config index 6937017d..316cd153 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -3,6 +3,7 @@ Nextflow config file for running full-size tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a full size pipeline test. + This tests the capacity of the pipeline to process a full size dataset. Use as follows: nextflow run nf-core/stableexpression -profile test_full, --outdir @@ -15,8 +16,6 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data - species = 'solanum tuberosum' - eatlas_keywords = "potato,stress" - eatlas_accessions = "E-MTAB-552" - datasets = "tests/input/custom_datasets/input.csv" + species = 'arabidopsis_thaliana' + outdir = "results/test_full" } diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 00000000..1b9ff481 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,137 @@ +# Pipeline configuration + +Although many parameters are directly exposed to the user via the CLI, setting cpu and memory requirements must be done via a configuration file. 
+By default, the max number of CPUs and memory the pipeline can use is defined in `conf/base.config`: + +``` +executor { + cpus = 8 + memory = 24.GB +} +``` + +This was set quite low on purpose, in order to make it run easily on most data science laptops. + +## Setting hard limits of CPU and memory + +One can modify it by creating a custom config file: + +``` +executor { + cpus = 32 + memory = 200.GB +} +``` + +then launch the pipeline using: + +```bash +nextflow run nf-core/stableexpression \ + -profile \ + -config + ... +``` + +## Modifying resource allocation for processes + +Let's say you have a laptop with only 4 CPUs and 12 GB of RAM. Running the pipeline may crash your computer because of a lack of memory. +To tell the pipeline to lower down its resource consumption, you can create a custom config file with: + +``` +executor { + cpus = 4 + memory = 8.GB +} + +withLabel:process_single { + memory = { 2.GB * task.attempt } +} +withLabel:process_low { + memory = { 2.GB + 1.GB * task.attempt } +} +withLabel:process_medium { + memory = { 4.GB + 1.GB * task.attempt } +} +withLabel:process_high { + memory = { 4.GB + 2.GB * task.attempt } +} +``` + +then launch the pipeline using: + +```bash +nextflow run nf-core/stableexpression \ + -profile \ + -config + ... +``` + +> [!WARNING] +> Please keep in mind that if the total number of datasets (downloaded from public datasets or directly provided by the user) is too big for your computer, the pipeline will crash. Even if much effort was made to minimise the memory usage, some steps still require a certain amount of memory to run successfully. 
+ +## Running with Slurm + +This is an example `launch_nf_core_stableexpression.sh` script to run the pipeline on an HPC cluster with Slurm: + +```bash +#!/bin/bash + +# set job name +#SBATCH --job-name=nf_run + +# set output file for logs +#SBATCH --output=logs/nf_run_%j.log + +# if your HPC cluster uses partitions, use a partition allowing for long runs +#SBATCH --partition= + +# to get email notifications +#SBATCH --mail-user= +#SBATCH --mail-type=END,FAIL + +# set max memory available to the Nextflow main node +#SBATCH --mem 2GB + +module load nextflow +# or load specific version: module load nextflow=25.10.04 + +# set location of apptainer cache directory +export NXF_APPTAINER_CACHEDIR=apptainer_cache + +nextflow run nf-core/stableexpression \ + -latest \ + -c slurm.config \ + -profile apptainer \ + -resume \ + -params-file params.yaml +``` + +with `slurm.config`: + +``` +executor { + name = 'slurm' + queue = // if your HPC cluster uses partitions, use a partition including fast runs + queueSize = 50 // see https://seqera.io/blog/5_tips_for_hpc_users/ + submitRateLimit = '10 sec' // see https://seqera.io/blog/5_tips_for_hpc_users/ + cpus = 64 // adjust to your needs + memory = 400.GB // adjust to your needs + time = 48.h // optional, only if you want to limit the runtime +} +``` + +and `params.yaml`: + +``` +species: +outdir: +[+ OTHER PARAMETERS] +``` + +Run this script with `sbatch`: + +``` +sbatch launch_nf_core_stableexpression.sh +``` + +For checking the status of the run, we recommend tools like [slurmer](https://crates.io/crates/slurmer).
diff --git a/docs/images/nf-core-sampleexpression_logo_dark.png b/docs/images/nf-core-sampleexpression_logo_dark.png deleted file mode 100644 index dea5d4a5..00000000 Binary files a/docs/images/nf-core-sampleexpression_logo_dark.png and /dev/null differ diff --git a/docs/images/nf-core-sampleexpression_logo_light.png b/docs/images/nf-core-sampleexpression_logo_light.png deleted file mode 100644 index b3f51d03..00000000 Binary files a/docs/images/nf-core-sampleexpression_logo_light.png and /dev/null differ diff --git a/docs/images/nf-core-stableexpression_logo_dark.png b/docs/images/nf-core-stableexpression_logo_dark.png new file mode 100644 index 00000000..24d8da8b Binary files /dev/null and b/docs/images/nf-core-stableexpression_logo_dark.png differ diff --git a/docs/images/nf-core-stableexpression_logo_light.png b/docs/images/nf-core-stableexpression_logo_light.png new file mode 100644 index 00000000..c4a8482e Binary files /dev/null and b/docs/images/nf-core-stableexpression_logo_light.png differ diff --git a/docs/images/nf-core-stableexpression_metro_map.drawio b/docs/images/nf-core-stableexpression_metro_map.drawio deleted file mode 100644 index e81490fd..00000000 --- a/docs/images/nf-core-stableexpression_metro_map.drawio +++ /dev/null @@ -1,250 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/images/nf-core-stableexpression_metro_map.png b/docs/images/nf-core-stableexpression_metro_map.png deleted file mode 100644 index d3224cb3..00000000 Binary files 
a/docs/images/nf-core-stableexpression_metro_map.png and /dev/null differ diff --git a/docs/images/nf_core_stableexpression.metromap.drawio b/docs/images/nf_core_stableexpression.metromap.drawio new file mode 100644 index 00000000..e3953c65 --- /dev/null +++ b/docs/images/nf_core_stableexpression.metromap.drawio @@ -0,0 +1,337 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/images/nf_core_stableexpression.metromap.png b/docs/images/nf_core_stableexpression.metromap.png new file mode 100644 index 00000000..54e3df8e Binary files /dev/null and b/docs/images/nf_core_stableexpression.metromap.png differ diff --git a/docs/output.md b/docs/output.md index 2f8a68eb..38dd2b30 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,68 +1,134 @@ # nf-core/stableexpression: Output +## Pipeline reports (TLDR) + +The main output of the pipeline is the MultiQC report, which summarises results at the end of the pipeline. This report is located at `/multiqc/multiqc_report.html` and can be opened in your favorite browser. + +For advanced users who seek to explore more deeply the distributions of normalised counts gene per gene or sample per sample, a Dash Plotly app is readily prepared at the end of each pipeline run. See [here](#dash-plotly-app) for explanation on how to run the app. 
+ ## Introduction This document describes the output produced by the pipeline. -## Pipeline overview +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +## Main output files -- [Expression Atlas](#expression-atlas) -- [DESeq2](#deseq2) or [EdgeR](#edger): normalize raw data -- [g:Profiler](#gprofiler-idmapping): map gene IDS to Ensembl IDS -- [Variation coefficient](#variation-coefficient): Compute gene variation coefficients and get the most stable genes +### MultiQC -### Expression Atlas +This report is located at `multiqc/multiqc_report.html` and can be opened in a browser.
Output files -- `expressionatlas/` - - List of accessions found when querying Expression Atlas: `accessions.txt`. - - A list of count datasets and experimental designs download from Expression Atlas. Normalized datasets have the `normalized.csv` while not normalized datasets have the `raw.csv` extension. +- `multiqc/` + - MultiQC report file: `multiqc_report.html`. + - MultiQC data dir: `multiqc_data`. + - Plots created by MultiQC: `multiqc_plots`.
-### Deseq2 +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. + +Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. + +### Dash Plotly app + +`dash_app/`: folder containing the Dash Plotly app + +To launch the app, you must first create and activate the appropriate conda environment: + +```bash +conda env create -n nf-core-stableexpression-dash -f /dash_app/spec-file.txt +conda activate nf-core-stableexpression-dash +``` + +then: + +``` +cd dash_app +python app.py +``` + +and open your browser at `http://localhost:8080` + +> [!NOTE] +> The app will try to use the port `8080` by default. If it is already in use, it will try `8081`, `8082` and so on. Check the logs to see which port it is using. + +### Statistics and scoring + +The gene stat summary is also bundled with the Dash Plotly app.
Output files -- `normalization/deseq2/` - - List of newly normalized datasets +- `dash_app/data/all_genes_summary.csv`: file containing all gene statistics, scores and ranked by stability score
-### EdgeR +### Merged data + +The file containing all normalised counts is bundled as a Parquet file with the Dash Plotly app.
Output files -- `normalization/edger/` - List of newly normalized datasets +- `dash_app/data/all_counts.imputed.parquet`: parquet file containing all normalised + imputed gene counts +- `idmapping/global_gene_metadata.csv`: table containing the complete set of gene metadata, obtained either via gProfiler or via the custom file provided by the user +- `idmapping/global_gene_id_mapping.csv`: table containing the complete set of gene id mapping, obtained either via gProfiler or via the custom file - - +- `merged_datasets/whole_design.csv`: table containing designs for all datasets and all samples comprised in the analysis
-### GProfiler IDMapping +## Other output files of interest (useful for debugging) + +### Expression Atlas
Output files -- `idmapping/` - - Count datasets whose gene IDs have been mapped to Ensembl IDs (suffix `renamed.csv`). - - Correspondencies between original gene IDs and Ensembl IDs (suffix `mapping.json`.) +- `public_data/expression_atlas/accessions/`: accessions found when querying Expression Atlas +- `public_data/expression_atlas/datasets/`: count datasets (normalized: `*.normalised.csv` / raw: `*.raw.csv`) and experimental designs (`*.design.csv`) downloaded from Expression Atlas.
-### Variation coefficient +### GEO + +
+Output files + +- `public_data/geo/accessions/`: accessions found when querying GEO +- `public_data/geo/datasets/`: count datasets (normalized: `*.normalised.csv` / raw: `*.raw.csv`) and experimental designs (`*.design.csv`) downloaded from GEO. + +
+ +### IDMapping (g:Profiler) + +
+Output files + + - `renamed`: count datasets with renamed and filtered gene IDs + +
+ +### Normalisation + +
+Output files + +- `normalised/`: Newly normalised datasets + - `tpm/`: with TPM + - `cpm/`: with CPM +- `normalised/quantile_normalised` : Quantile normalised datasets + +### Genome annotation and gene length
Output files -- `variation_coefficients/` - - An ordered list from the most stable (first line) to the least stable gene in `variation_coefficients.csv`. - - All normalized counts (for each gene and each sample) in `all_normalized_counts.csv`. +- `gene_length/`: + - `gene_trnascript_lengths.csv`: table containing gene transcript lengths + - `*.gff*`: downloaded genome annotation
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 00000000..d68a0792 --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,68 @@ +# nf-core/stableexpression: Troubleshooting + +## Error 139 on macOS + +If you are running the pipeline on macOS with containers (`docker`, `apptainer`, `singularity`, ...), you may encounter issues like: + +``` +NOTE: Process `NFCORE_STABLEEXPRESSION:STABLEEXPRESSION:ID_MAPPING:CLEAN_GENE_IDS ()` terminated with an error exit status (139) -- Execution is retried (1) +``` + +eventually leading to pipeline failure. + +This is likely due to the python polars library not being compatible with macOS when run inside a container. + +You should run the pipeline with `-profile micromamba` or `-profile conda`. + +## No dataset found + +For species that are not on Expression Atlas, the pipeline will not be able to find suitable datasets and will log the following message: + +``` +ERROR: Could not find any readily usable public dataset +... +``` + +> [!TIP] +> You can first try to have the pipeline fetch suitable datasets from NCBI GEO by providing the `--fetch_geo_accessions` flag. + +In case no datasets are found, you'll have to find a way to get count datasets and to prepare them for the pipeline. +A good start is to check in the folder `/public_data/geo/datasets/` if there are `rejected` subfolders. Such subfolders contain datasets that were downloaded (together with their experimental design) but failed to pass checks. Quite often, some of them can be manually reprocessed to be suitable for the pipeline. + +Finally, you may want to check by yourself on [NCBI GEO](https://www.ncbi.nlm.nih.gov/gds). + +Alternatively, some public websites contain expression datasets that may be suitable for the pipeline, such as: + +- [Bgee](https://www.bgee.org/) + +## Not enough memory + +The pipeline limits the number of downloaded datasets to a certain number in order to limit RAM usage, especially for `homo sapiens`.
+ +However, on small computers, the limit may be too permissive and lead to RAM overhead. You can reduce the number of datasets downloaded by setting the `--random_sampling_size` to a lower value. + +## Why do I get only a fraction of the public datasets available on Expression Atlas or NCBI GEO? Give them back! + +To reduce the RAM overhead, the pipeline selects randomly a certain number of datasets, based on the number of samples they contain. To increase the number of collected datasets, you can increase the `--random_sampling_size` parameter. + +> [!TIP] +> +> A seed is also set in order to make the runs reproducible. You can change the subset of chosen datasets by changing the `--random_sampling_seed`. + +## The pipeline failed to find a genome annotation for the specified species + +If you know the length of the longest cDNA for each gene, you can provide gene lengths yourself with the `--gene_length` flag (see [Custom gene ID mapping / metadata / length](usage.md#5-custom-gene-id-mapping--metadata)). In case you do not have access to gene length, TPM normalisation cannot be performed. A fallback is to use CPM normalisation by setting `--normalisation_method cpm`. It will introduce a small bias towards long genes, but this should not result in big changes. + +## Java heap space + +In some cases, in particular when running the pipeline on a very large number of datasets (such as for `Homo sapiens`), the Nextflow Java virtual machines can start to request a large amount of memory.
You may happen to see the following error: + +``` +java.lang.OutOfMemoryError: Java heap space +``` + +We recommend to increase the memory available to Java: + +```bash +export NXF_OPTS='-Xms1g -Xmx4g' +``` diff --git a/docs/usage.md b/docs/usage.md index 0ca822eb..2bc85a3c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,118 +2,249 @@ ## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/stableexpression/usage](https://nf-co.re/stableexpression/usage) +> [!WARNING] +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files). + +> [!TIP] +> For setting number of CPUs and memory used by the pipeline, or for instruction on how to run it on an HPC, see the [configuration instructions](configuration.md). + +> [!NOTE] +> In case of issues with the pipeline, please check the [troubleshooting page](troubleshooting.md) or [report a new issue](https://github.com/nf-core/stableexpression/issues). + > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ -## Pathways +## 1. Basic run + +This pipeline fetches Expression Atlas and GEO accessions for the provided species and downloads the corresponding data. + +```bash +nextflow run nf-core/stableexpression \ + -profile \ + --species \ + --outdir \ + -resume +``` + +> [!TIP] +> It is often a good practice to run the pipeline with the `-resume` flag. See the [Nextflow documentation on caching and resuming](https://www.nextflow.io/docs/latest/cache-and-resume.html) for more information. -You can run this pipeline in multiple ways. +> [!NOTE] +> See [here](#profiles) for more information about profiles. -1. 
Expression Atlas **automatic mode**: without keywords +## 2. Specific public datasets -This run fetches Expression Atlas accessions corresponding to the provided species and downloads the corresponding data. +You can provide keywords to restrict downloaded datasets to specific conditions. ```bash nextflow run nf-core/stableexpression \ - -profile \ - --species \ - --fetch_eatlas_accessions \ + -profile \ + --species \ + --keywords --outdir ``` -1. Expression Atlas **automatic mode**: with keywords +> [!NOTE] +> +> - Multiple keywords must be separated by commas. +> - Please note that keywords are additive: you will get datasets that fit with **either of the provided keywords**. +> - A dataset will be downloaded if a keyword is found in its summary or in the name of a sample. +> - The natural language processing [`nltk`](https://www.nltk.org/) python package is used to find keywords as well as derived words. For example, the `leaf` keyword should match 'leaf', 'leaves', 'leafy', etc. -The run fetches Expression Atlas accessions corresponding to the provided species / keywords and downloads the corresponding data. You do not need to specify the `--fetch_eatlas_accessions`when you specify keywords. +## 3. Provide your own accessions + +You may already have an idea of specific Expression Atlas / GEO accessions you want to use in the analysis. +In this case, you can provide them directly to the pipeline. ```bash nextflow run nf-core/stableexpression \ - -profile \ - --species \ - --eatlas_keywords + -profile \ + --species \ + --skip_fetch_eatlas_accessions \ + [--eatlas_accessions ] \ + [--eatlas_accessions_file ] \ + [--geo_accessions ] \ + [--geo_accessions_file ] \ --outdir ``` -3. Expression Atlas **manual mode** +> [!WARNING] +> If you want to download only the datasets corresponding to the accessions supplied, you must set the `--skip_fetch_eatlas_accessions` parameter.
+ +> [!NOTE] +> If you provide accessions through `--eatlas_accessions_file` or `--geo_accessions_file`, there must be one accession per line. The extension of the file does not matter. -The pipeline downloads the count datasets and experimental designs corresponding to the provided accessions. +In case you do not know which accessions you want but you would like to control precisely which datasets are included in your analysis, you may run first: ```bash nextflow run nf-core/stableexpression \ - -profile \ - --species \ - --eatlas_accessions \ + -profile \ + --species \ + --accessions_only \ --outdir ``` -4. Using local count datasets +Fetched accessions with their respective metadata will be available in `/expression_atlas/accessions/` and `/geo/accessions/` -Conversely, you can provide your own counts datasets / experiment designs. +## 4. Use your own expression datasets -First, prepare a samplesheet listing the different count datasets you want to use. Each row represents a specific dataset and must contain: +You can of course provide your own counts datasets / experimental designs. -- counts: the path to the count dataset (a CSV file) -- design: the path to the experimental design associated to this dataset (a CSV file) -- normalized: a boolean (true / false) representing whether the counts are already normalized or not +> [!NOTE] +> +> - To ensure all RNA-seq datasets are processed the same way, users should provide **raw counts**. +> - If normalised counts are provided, users should apply the same normalisation process to all of them. **The preferred method is `TPM`**. -It should look as follows: +> [!WARNING] +> Microarray data must be already normalised. When mixing your own datasets with public ones in a single run, you should use the `RMA` method in order to be compliant with Expression Atlas and GEO datasets. + +First, prepare a CSV samplesheet listing the different count datasets you want to use.
Each row represents a specific dataset and must contain: -`datasets.csv`: +| Column | Description | +| ------------ | ----------------------------------------------------------------------------------------- | +| `counts` | Path to the count dataset (a CSV / TSV file) | +| `design` | Path to the experimental design associated to this dataset (a CSV / TSV file) | +| `platform` | Platform used to generate the counts (`rnaseq` or `microarray`) | +| `normalised` | Boolean (`true` / `false`) representing whether the counts are already normalised or not. | -```csv -counts,design,normalized -path/to/normalized.counts.csv,path/to/normalized.design.csv,true -path/to/raw.counts.csv,path/to/raw.design.csv,false +It should look as follows: + +```csv title=datasets.csv +counts,design,platform,normalised +path/to/normalised.counts.csv,path/to/normalised.design.csv,rnaseq,true +path/to/raw.counts.csv,path/to/raw.design.csv,rnaseq,false +path/to/microarray.counts.csv,path/to/microarray.design.csv,microarray,true ``` -While the counts and design CSV files should have the following structure: +It can also be a YAML file: + +```yaml title=datasets.yaml +- counts: path/to/normalised.counts.csv + design: path/to/normalised.design.csv + platform: rnaseq + normalised: true +- counts: path/to/raw.counts.csv + design: path/to/raw.design.csv + platform: rnaseq + normalised: false +- counts: path/to/microarray.counts.csv + design: path/to/microarray.design.csv + platform: microarray + normalised: true +``` -`counts.csv`: +The counts should have the following structure: -```csv -,sample_A,sample_B,sample_C +```csv title=counts.csv +gene_id,sample_A,sample_B,sample_C gene_1,1,2,3 gene_2,1,2,3 -... ``` -> [!WARNING] -> Remember to write a comma before the first sample name. 
This serves to indicate that the actual first column (gene IDs) is the index - -`design.csv`: +While the design should look like: -```csv +```csv title=design.csv sample,condition sample_A,condition_1 sample_B,condition_2 -... +sample_C,condition_1 ``` +> [!WARNING] +> +> - In the count file, the first header column (corresponding to gene IDs) should not be empty. However, its name can be anything. +> - The count file should not have any column other than the first one (gene IDs) and the sample columns. + +> [!TIP] +> Both counts and design files can also be supplied as TSV files. + Now run the pipeline with: ```bash nextflow run nf-core/stableexpression \ - -profile \ - --species \ - --datasets \ + -profile \ + --species \ + --datasets \ + --skip_fetch_eatlas_accessions \ --outdir ``` -## Running the pipeline +> [!TIP] +> The `--skip_fetch_eatlas_accessions` parameter is supplied here to show how to analyse **only your own dataset**. You may remove this parameter if you want to mix your dataset(s) with public ones. -You can run the pipeline using a mix of the different pathways. +> [!IMPORTANT] +> By default, the pipeline tries to map gene IDs to Ensembl gene IDs. **All genes that cannot be mapped are discarded from the analysis**. This ensures that all genes are named the same between datasets and allows comparing multiple datasets with each other. If you are confident that your genes have the same name between your different datasets or if you think on the contrary that your gene IDs just won't be mapped properly, you can disable this mapping by adding the `--skip_id_mapping` parameter. In such case, we recommend users to supply their own gene id mapping and gene metadata files using the `--gene_id_mapping` and `--gene_metadata` parameters respectively.
+> +> Both files are totally optional, however: +> - a custom gene id mapping might help merging datasets properly +> - custom gene metadata (association between gene id, gene name and gene description) will supply relevant metadata in the final MultiQC report +> +> See [next section](#5-custom-gene-id-mapping--metadata) for further details. -Example usage: +> [!TIP] +> You can check if your gene IDs can be mapped using the [g:Profiler server](https://biit.cs.ut.ee/gprofiler/convert). -> ```bash -> nextflow run nf-core/stableexpression \ -> -profile docker \ -> --species "Arabidopsis thaliana" \ -> --eatlas_accessions "E-MTAB-552,E-GEOD-61690" \ -> --eatlas_keywords "stress,flowering" \ -> --datasets ./datasets.csv \ -> --outdir ./results -> ``` +### 5. Custom gene ID mapping / metadata + +You can supply your own gene ID mapping and / or gene metadata with the `--gene_id_mapping` and `--gene_metadata` parameters respectively. The gene ID mapping file is used to map gene IDs in count table(s) (local or downloaded) to more generic IDs that will be used as a basis for subsequent steps. The gene metadata file provides additional information about the genes, such as their common name and description. + +Structure of the gene id mapping file: + +| Column | Description | +| ------------------ | --------------------------------------------- | +| `original_gene_id` | Gene ID used in the provided count dataset(s) | +| `gene_id` | Mapped gene ID | + +Example: + +```csv title=gene_id_mapping.csv +original_gene_id,gene_id +gene_A,ENSG1234567890 +geneB,OTHERmappedgeneID +``` + +Structure of the gene metadata file: -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 
+| Column | Description | +| ------------- | ---------------- | +| `gene_id` | Mapped gene ID | +| `name` | Gene common name | +| `description` | Gene description | + +Example: + +```csv title=gene_metadata.csv +gene_id,name,description +ENSG1234567890,Gene A,Description of gene A +OTHERmappedgeneID,My OTHER Gene,Another description +``` + +### 6. Custom gene annotation / gene length + +For the computation of TPM values during gene expression normalisation, the knowledge of gene length is required. In the case where the species of interest does not have a public annotation, or if you are encountering network issues, you can supply directly either your own genome annotation or a file associating gene ids to gene lengths with the `--gff` and `--gene_length` parameters respectively. + +The genome annotation must be in `GFF` format and have the `.gff` extension. You can use the [`AGAT`](https://github.com/NBISweden/AGAT) package to convert other genome annotation formats to `GFF`. + +The gene length file must be in `CSV` or `TSV` format and have the following structure: + +| Column | Description | +| --------- | -------------------------------- | +| `gene_id` | Mapped gene ID | +| `length` | Gene length (longest transcript) | + +Example: + +```csv title=gene_length.csv +gene_id,length +ENSG1234567890,1000 +OTHERmappedgeneID,2000 +``` + + +### 7. More advanced scenarios + +For advanced scenarios, you can see the list of available parameters in the [parameter documentation](https://nf-co.re/stableexpression/parameters). + +## Pipeline output Note that the pipeline will create the following files in your working directory: @@ -124,23 +255,26 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +For a detailed description of the output files, please consult the [nf-core stableexpression output directory structure](https://nf-co.re/stableexpression/output). 
+ +## Parameters + If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -:::warning -Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -::: +> [!WARNING] +> Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). The above pipeline run specified with a params file in yaml format: ```bash -nextflow run nf-core/stableexpression -profile docker -params-file params.yaml +nextflow run -r dev nf-core/stableexpression -profile docker -params-file params.yaml ``` -with `params.yaml` containing: +with: -```yaml +```yaml title="params.yaml" species: 'Homo sapiens' datasets: './datasets.csv' outdir: './results/' @@ -159,44 +293,64 @@ nextflow pull nf-core/stableexpression ### Reproducibility -It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. +It is a good idea to specify the pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. 
If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. First, go to the [nf-core/stableexpression releases page](https://github.com/nf-core/stableexpression/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. -To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. +To further assist in reproducibility, you can share and reuse [parameter files](#parameters) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -:::tip -If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. -::: +> [!TIP] +> If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. ## Core Nextflow arguments -:::note -These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). -::: +> [!NOTE] +> These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). -### `-profile` +### [`-profile`](#profiles) Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.
Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -:::info -We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. -::: +> [!IMPORTANT] +> We highly recommend the use of Apptainer (Singularity) or Docker containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. + +> [!TIP] +> +> When running the pipeline on a multi-user server or on a cluster, the best practice is to use Apptainer (formerly Singularity). You can install Apptainer by following these [instructions](https://apptainer.org/docs/admin/main/installation.html#). +> In case you encounter the following error when running Apptainer: +> +> ``` +> ERROR : Could not write info to setgroups: Permission denied +> ERROR : Error while waiting event for user namespace mappings: no event received +> ``` +> +> you may need to install the `apptainer-suid` package instead of `apptainer`: +> +> ``` +> # Debian / Ubuntu +> sudo apt install apptainer-suid +> # RHEL / CentOS +> sudo yum install apptainer-suid +> # Fedora +> sudo dnf install apptainer-suid +> ``` -The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. +The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time.
For more information and to check if your system is supported, please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles. -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment. +If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer environment. - `test` - A profile with a complete configuration for automated testing - Includes links to test data so needs no other parameters +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/) - `singularity` @@ -206,13 +360,13 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `shifter` - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) -- `apptainer` - - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) + - A generic configuration profile to be used with [Charliecloud](https://charliecloud.io/) - `wave` - A generic configuration profile to enable [Wave](https://seqera.io/wave/) containers. Use together with one of the above (requires Nextflow ` 24.03.0-edge` or later). 
- `conda` - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. +- `micromamba` + - A faster, more lightweight alternative to Conda. As for Conda, use Micromamba as a last resort. ### `-resume` @@ -228,13 +382,13 @@ Specify the path to a specific config file (this is a core Nextflow command). Se ### Resource requests -Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. +Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the pipeline steps, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher resources request (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. 
To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. ### Custom Containers -In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. +In some cases, you may wish to change the container or conda environment used by a pipeline step for a particular tool. By default, nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However, in some cases the pipeline specified version may be out of date. To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. @@ -252,14 +406,6 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). -## Azure Resource Requests - -To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. -We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. - -Note that the choice of VM size depends on your quota and the overall workload during the analysis.
-For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes). - ## Running in the background Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. diff --git a/galaxy/README.md b/galaxy/README.md new file mode 100644 index 00000000..3aaf6761 --- /dev/null +++ b/galaxy/README.md @@ -0,0 +1,96 @@ +# Galaxy + +The following instructions need to be performed for each release + +>[!TIP] +>For the first time setup of Galaxy for your Nextflow pipeline, see the [setup instructions](setup.md) + +## Activate environment + +>[!NOTE] +>If your planemo environment is not set up, see the [setup instructions](setup.md) + +Activate your planemo environment: +``` +micromamba activate planemo +``` + +## At each release: build XML file + +### Optional: modify static values in template file + +If needed, you can: +- update the versions of core dependencies (Nextflow, Micromamba, OpenJDK) +- modify outputs +- modify tests + +>[!NOTE] +>The versions of core dependencies (Nextflow, Micromamba, OpenJDK) are not updated automatically, although the code necessary for this is already implemented. +>For now, we want to keep control over the versions used, to avoid versions that may contain bugs. + +### Update tool + +Update the tool XML file: +``` +python build/build_tool.py +``` + +This script will fetch : + +- all the parameters in your nextflow_schema.json +- the latest version of Nextflow, Singularity and OpenJDK in Conda channels. + +and modify the XML file located at `galaxy/tool_shed/tool/nf_core_stableexpression.xml`. + +Your tool is ready to be used! + +## Test tool + +### Launch local Galaxy server + +You may want to have a first look at what your tool looks like in the Galaxy interface.
+To launch a local instance of Galaxy with your tool already installed: + +``` +./serve +``` + +You can test the behaviour of your tool by providing different inputs and check the corrsponding output. + +### Linting and testing + +To lint your tool: + +``` +./lint +``` + +>[!WARNING] +>The test script is not working for now... Planemo does not seem to find the input data for testing... +>For the moment, testing in a local webserver and linting using the provided script should be sufficient. + +To test your tool: + +``` +./test +``` + +## Publishing to the Galaxy Toolshed + + +``` +cd tool_shed +``` + +### Optional: test update on the test Toolshed + +If you have already set up an account on the test Toolshed, you can test the update of your tool: +``` +planemo shed_update --shed_target toolshed +``` + +### Official Galaxy Toolshed + +``` +planemo shed_update --shed_target toolshed +``` diff --git a/galaxy/build/build_template.py b/galaxy/build/build_template.py new file mode 100755 index 00000000..0a163465 --- /dev/null +++ b/galaxy/build/build_template.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + +import logging +from pathlib import Path + +from formatters import ConfigFormatter + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +STATIC_TOOL_FILENAME = Path(__file__).parent / "static/template.xml" +BOILERPLATE_FILENAME = Path(__file__).parent / "static/template.boilerplate.xml" + + +def main(): + logger.info("Parsing config") + pipeline_metadata = ConfigFormatter.get_pipeline_metadata() + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # REPLACING ACTUAL PARAMS IN STATIC TOOL + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + with open(BOILERPLATE_FILENAME, "r") as fin: + boilerplate_string = fin.read() + + pipeline_name = pipeline_metadata["name"].replace("nf-core/", "") + + logger.info("Building template XML file") + template_string = 
boilerplate_string.replace("PIPELINE_NAME", pipeline_name) + + with open(STATIC_TOOL_FILENAME, "w") as fout: + fout.write(template_string) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/galaxy/build/build_tool.py b/galaxy/build/build_tool.py new file mode 100755 index 00000000..ef8e8432 --- /dev/null +++ b/galaxy/build/build_tool.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +import logging +from pathlib import Path + +from formatters import ConfigFormatter, SchemaFormatter + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +tool_boilerplate_file = Path(__file__).parent / "static/template.xml" +tool_file = Path(__file__).parents[1] / "tool_shed/tool/nf_core_{}.xml" + + +def main(): + logger.info("Formatting config") + # package_versions = ConfigFormatter.get_package_versions() + pipeline_metadata = ConfigFormatter.get_pipeline_metadata() + + logger.info("Formatting schema") + schema_formatter = SchemaFormatter() + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # REPLACING ACTUAL PARAMS IN STATIC TOOL + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + with open(tool_boilerplate_file, "r") as fin: + static_string = fin.read() + + # checking if package versions were filled by the user + for package_version in ["OPENJDK_VERSION"]: + if package_version in static_string: + raise ValueError( + f"You must fill the package version in place of {package_version} before building" + ) + + logger.info("Building tool XML file") + tool_string = ( + static_string + # .replace("NEXTFLOW_VERSION", package_versions["nextflow"]) + # .replace("APPTAINER_VERSION", package_versions["apptainer"]) + # .replace("OPENJDK_VERSION", package_versions["openjdk"]) + .replace("PIPELINE_VERSION", pipeline_metadata["version"]) + .replace("DESCRIPTION", schema_formatter.pipeline_description) + .replace("PARAMETERS", schema_formatter.params_cli) + 
.replace("INPUTS", schema_formatter.inputs) + ) + + pipeline_name = pipeline_metadata["name"].replace("nf-core/", "") + outfile = Path(str(tool_file).format(pipeline_name)) + with open(outfile, "w") as fout: + fout.write(tool_string) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/galaxy/build/formatters/__init__.py b/galaxy/build/formatters/__init__.py new file mode 100644 index 00000000..4f46ac15 --- /dev/null +++ b/galaxy/build/formatters/__init__.py @@ -0,0 +1,4 @@ +from .schema.base import SchemaFormatter +from .config.base import ConfigFormatter + +__all__ = ["SchemaFormatter", "ConfigFormatter"] diff --git a/galaxy/build/formatters/config/base.py b/galaxy/build/formatters/config/base.py new file mode 100644 index 00000000..3e7fc16d --- /dev/null +++ b/galaxy/build/formatters/config/base.py @@ -0,0 +1,89 @@ +import logging +import re +from dataclasses import dataclass +from pathlib import Path +from typing import ClassVar + +import requests +from packaging.version import parse as vparse + +logger = logging.getLogger(__name__) + + +@dataclass +class BaseConfigFormatter: + """ + Base class for extracting metadata from the pipeline's config files. 
+ """ + + CONFIG_FILE: ClassVar[Path] = Path(__file__).parents[4] / "nextflow.config" + MAIN_FILE: ClassVar[Path] = Path(__file__).parents[4] / "main.nf" + PACKAGES_REPOS: ClassVar[dict] = { + "nextflow": "bioconda", + "micromamba": "conda-forge", + "openjdk": "conda-forge", + } + + @classmethod + def get_package_versions(cls) -> dict: + # CONDA PACKAGE VERSIONS + package_version = {} + for package, repo in cls.PACKAGES_REPOS.items(): + package_version[package] = cls.get_package_version(package, repo) + return package_version + + @staticmethod + def get_package_version(package: str, repo: str) -> str: + """ + Get latest pip version of package + """ + logger.info(f"Getting latest version of package {package}") + url = f" https://api.anaconda.org/package/{repo}/{package}" + try: + response = requests.get(url) + response.raise_for_status() + data = response.json() + versions = sorted( + data["versions"], reverse=True, key=vparse + ) # from latest to oldest + return versions[0] # most recent + except requests.RequestException as e: + raise RuntimeError(f"Error fetching version info: {e}") + + @classmethod + def get_pipeline_metadata(cls) -> dict: + # PARSING CONFIG + with open(cls.CONFIG_FILE, "r") as f: + pipeline_config = f.read() + + # regular expression to find the manifest block and extract the version + manifest_pattern = re.compile(r"manifest\s*{\s*(.*?)\s*}", re.DOTALL) + manifest_match = manifest_pattern.search(pipeline_config) + version = None + name = None + + if manifest_match: + manifest_content = manifest_match.group(1) + + # regular expression to find the version field + name_pattern = re.compile(r'name\s*=\s*[\'"](.*?)[\'"]') + name_match = name_pattern.search(manifest_content) + if name_match: + name = name_match.group(1) + else: + raise ValueError("No name found in pipeline config") + + # regular expression to find the version field + version_pattern = re.compile(r'version\s*=\s*[\'"](.*?)[\'"]') + version_match = 
version_pattern.search(manifest_content) + if version_match: + version = version_match.group(1) + else: + raise ValueError("No version found in pipeline config") + + return dict(name=name, version=version) + + +@dataclass +class ConfigFormatter(BaseConfigFormatter): + pass diff --git a/galaxy/build/formatters/schema/base.py b/galaxy/build/formatters/schema/base.py new file mode 100644 index 00000000..50a702f0 --- /dev/null +++ b/galaxy/build/formatters/schema/base.py @@ -0,0 +1,98 @@ +from pathlib import Path +import json +from dataclasses import dataclass, field +from typing import ClassVar +from . import parameter + + +@dataclass +class SchemaFormatter: + SCHEMA_FILE: ClassVar[Path] = Path(__file__).parents[4] / "nextflow_schema.json" + PARAMS_TO_IGNORE: ClassVar[list] = ["outdir", "email", "multiqc_title"] + SECTIONS_TO_IGNORE: ClassVar[list] = [ + "institutional_config_options", + "generic_options", + ] + SECTIONS_TO_EXPAND: ClassVar[list] = ["input_output_options"] + + pipeline_description: str = field(init=False) + inputs: str = field(init=False) + params_cli: str = field(init=False) + _pipeline_params: dict = field(init=False) + + _inputs: list = field(init=False, default_factory=list) + _params_cli: list = field(init=False, default_factory=list) + + def __post_init__(self): + self.parse_schema_file() + + def parse_schema_file(self): + with open(self.SCHEMA_FILE, "r") as f: + pipeline_schema = json.load(f) + + self.pipeline_description = pipeline_schema["description"].strip("\n") + self._pipeline_params = pipeline_schema["$defs"] + + # PARSING PARAMETERS AND BUILDING STRINGS + for section, section_dict in self._pipeline_params.items(): + if section in self.SECTIONS_TO_IGNORE: + continue + + section_inputs, section_params_cli, section_usage_options = ( + self.format_input_section(section, section_dict) + ) + + self._inputs += section_inputs + self._params_cli += section_params_cli + + self.inputs = "\n".join(self._inputs) + self.params_cli = 
"\n".join(self._params_cli) + + def format_input_section( + self, section: str, section_dict: dict + ) -> tuple[list, list, list]: + section_inputs = [] + section_params_cli = [] + section_usage_options = [] + + section_title = "" + section_help = "" + + if title := section_dict.get("title"): + section_title = f' title="{title}"' + if description := section_dict.get("description"): + section_help = f' help="{description}"' + + section_expanded = ( + ' expanded="true"' + if section in self.SECTIONS_TO_EXPAND + else ' expanded="false"' + ) + + section_inputs.append( + f'\t\t
' + ) + section_usage_options.append("\n\t" + section.capitalize().replace("_", " ")) + + required_params = section_dict.get("required", []) + + for param, param_dict in section_dict["properties"].items(): + if param not in self.PARAMS_TO_IGNORE: + optional = param not in required_params + + # checking if param must be parsed in a generic or in a custom way + if param in parameter.PARAMETER_TO_CUSTOM_CLASS: + class_ = parameter.PARAMETER_TO_CUSTOM_CLASS[param] + else: + class_ = parameter.BaseParameterFormatter + + param_formatter = class_(param, section, param_dict, optional) + + # input arguments + section_inputs.append(param_formatter.get_input()) + # cli + section_params_cli.append(param_formatter.get_cli()) + + section_inputs.append("\t\t
") + + return section_inputs, section_params_cli, section_usage_options diff --git a/galaxy/build/formatters/schema/parameter/__init__.py b/galaxy/build/formatters/schema/parameter/__init__.py new file mode 100644 index 00000000..2708d3be --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/__init__.py @@ -0,0 +1,14 @@ +from .base import BaseParameterFormatter +from .datasets import DatasetsParameterFormatter +from .required import RequiredParameterFormatter + +# from .default_value import DefaultValueParameterFormatter + +PARAMETER_TO_CUSTOM_CLASS = { + "datasets": DatasetsParameterFormatter, + "normalisation_method": RequiredParameterFormatter, + "nb_top_gene_candidates": RequiredParameterFormatter, + # "species": DefaultValueParameterFormatter, +} + +__all__ = ["BaseParameterFormatter"] diff --git a/galaxy/build/formatters/schema/parameter/base.py b/galaxy/build/formatters/schema/parameter/base.py new file mode 100644 index 00000000..87402b0c --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/base.py @@ -0,0 +1,201 @@ +from dataclasses import dataclass +from typing import ClassVar, override + + +@dataclass +class Validator: + """ """ + + PATTERN: ClassVar[str] = ( + '\t\t\t{expression}\n' + ) + + type: str + message: str + expression: str + + @override + def __str__(self): + return self.PATTERN.format( + type=self.type, message=self.message, expression=self.expression + ) + + +@dataclass +class Option: + """ + Represents an option for a parameter. + + Attributes: + value (str): The value of the option. + default_value (str): The default value of the option. + optional (bool): Whether the option is optional. 
+ """ + + PATTERN: ClassVar[str] = ( + '\t\t\t\n' + ) + + value: str + default_value: str | None + optional: bool + + @override + def __str__(self): + selected_arg = ' selected="true"' if self.value == self.default_value else "" + return self.PATTERN.format( + option=self.value, label=self.value.capitalize(), selected_arg=selected_arg + ) + + +@dataclass +class BaseParameterFormatter: + NF_TYPES_TO_GALAXY: ClassVar[dict] = { + "string": "text", + "boolean": "boolean", + "integer": "integer", + "number": "float", + } + BASE_INPUT_PARAM: ClassVar[str] = ( + '\t\t\t' + ) + + param: str + section: str + param_dict: dict + optional: bool + + @staticmethod + def enrich_input_param(input_param_str: str, args: list[str]) -> str: + # opening param for enrichment + input_param_str = input_param_str.replace(" />", ">\n") + # adding each arg in a separate line + for arg in args: + input_param_str += "\t" + arg + # closing + input_param_str += "\t\t\t" + return input_param_str + + @staticmethod + def extract_extensions(extension_str: str): + def clean_extension(ext: str) -> str: + ext = ext.strip().lower() + if ext == "yml": + return "yaml" + return ext + + # removing the .dat extension, that is only used in the pipeline + # in order to allow files from the Galaxy file system (all renamed in .dat) + base_extensions = [ext for ext in extension_str.split("|") if ext != "dat"] + # Galaxy does not allow 'yml', only 'yaml' + return list(set([clean_extension(ext) for ext in base_extensions])) + + def process_file_param(self): + input_type = "data" + # removing extension check as files are renamed in .dat files by Galaxy + if pattern := self.param_dict.get( + "pattern" + ): # going from something like "^\\S+\\.(csv|yaml)$" to "csv,ya + # getting the extensions part + extension_str = pattern.split(".")[-1] + # removes recursively all leading and traling "(", ")" and "$" + extension_str = extension_str.strip("$()") + # getting list of extensions; removing dat because this extension is 
specifically made to handle Galaxy filename + formated_extensions_str = ",".join(self.extract_extensions(extension_str)) + param_format = f' format="{formated_extensions_str}"' + else: + # there is no specific pattern provided in the schema, this means that the format does not matter much + # however, the planemo linter needs a format, so we specify format="data" + param_format = ' format="data"' + return input_type, param_format + + def get_input(self) -> str: + """ + building input param + """ + + # making copy of base input param string + input_param_str = self.BASE_INPUT_PARAM + + param_format = "" + param_label = "" + param_help = "" + param_true_false = "" + param_value = "" + param_min = "" + param_max = "" + param_optional = ' optional="true"' if self.optional else ' optional="false"' + + param_type = self.param_dict["type"] + default_value = self.param_dict.get("default") + + # special case when parameter is a file + if param_type == "string" and self.param_dict.get("format") == "file-path": + input_type, param_format = self.process_file_param() + + # all other types + else: + input_type = self.NF_TYPES_TO_GALAXY[param_type] + + if param_type == "boolean": + param_true_false = f' truevalue="--{self.param}" falsevalue=""' + + elif param_type in ["integer", "number"]: + if minimum := self.param_dict.get("minimum"): + param_min = f' min="{minimum}"' + if maximum := self.param_dict.get("maximum"): + param_max = f' max="{maximum}"' + + elif param_type == "string": + # if there is a pattern for this string, we need to enrich this XML section with a validator + # TODO: handle (rare) case where bot enum and pattern are given + if pattern := self.param_dict.get("pattern"): # regex + msg = f"must match regular expression {pattern}" + validator = Validator(type="regex", message=msg, expression=pattern) + input_param_str = self.enrich_input_param( + input_param_str, args=[str(validator)] + ) + + # handle parameter with enum (options) + if option_values := 
self.param_dict.get("enum"): + input_type = "select" + options = [ + Option(value, default_value, self.optional) for value in option_values + ] + input_param_str = self.enrich_input_param( + input_param_str, args=[str(option) for option in options] + ) + + else: + if default_value is not None: + param_value = f' value="{default_value}"' + + if description := self.param_dict.get("description"): + param_label = f'label="{description}"' + if help_text := self.param_dict.get("help_text"): + param_help = f' help="{help_text}"' + + return input_param_str.format( + param=self.param, + type=input_type, + label=param_label, + format=param_format, + value=param_value, + min=param_min, + max=param_max, + true_false=param_true_false, + help=param_help, + optional=param_optional, + ) + + def get_cli(self) -> str: + # extra quotes if string parameter + value = ( + f'"${self.section}.{self.param}"' + if self.param_dict["type"] == "string" + else f"${self.section}.{self.param}" + ) + if self.optional: + return f"\t\t\t#if ${self.section}.{self.param}\n\t\t\t --{self.param} {value}\n\t\t\t#end if" + else: + return f"\t\t\t--{self.param} {value}" diff --git a/galaxy/build/formatters/schema/parameter/datasets.py b/galaxy/build/formatters/schema/parameter/datasets.py new file mode 100644 index 00000000..be28002d --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/datasets.py @@ -0,0 +1,47 @@ +import re +from dataclasses import dataclass +from typing import override + +from .base import BaseParameterFormatter + + +@dataclass +class DatasetsParameterFormatter(BaseParameterFormatter): + # if param is an optional file with multiple possible values, it requires special handling + # see https://docs.galaxyproject.org/en/latest/dev/schema.html#id51 + + @override + def get_input(self) -> str: + input_param_str = super().get_input() + # setting to required + # changing param name + input_param_str = input_param_str.replace( + 'optional="true"', 'optional="false"' + 
).replace(self.param, "samplesheet") + # changing label + input_param_str = re.sub( + r'label="[\s\w]*"', 'label="Samplesheet"', input_param_str + ) + + # adding conditional statement + return f""" \t\t\t + + + + + + {input_param_str} + + + + + + """ + + @override + def get_cli(self) -> str: + # see https://planemo.readthedocs.io/en/latest/writing_advanced.html#consuming-collections + return f""" + \t#if ${self.section}.datasets.provide_datasets == "true": + \t\t--datasets renamed_samplesheet.csv + \t#end if""" diff --git a/galaxy/build/formatters/schema/parameter/default_value.py b/galaxy/build/formatters/schema/parameter/default_value.py new file mode 100644 index 00000000..e141f461 --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/default_value.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from .base import BaseParameterFormatter + + +@dataclass +class DefaultValueParameterFormatter(BaseParameterFormatter): + def __post_init__(self): + self.param_dict["default"] = "Solanum tuberosum" diff --git a/galaxy/build/formatters/schema/parameter/required.py b/galaxy/build/formatters/schema/parameter/required.py new file mode 100644 index 00000000..52cdbb82 --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/required.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from .base import BaseParameterFormatter + + +@dataclass +class RequiredParameterFormatter(BaseParameterFormatter): + def __post_init__(self): + self.optional = False diff --git a/galaxy/build/static/template.boilerplate.xml b/galaxy/build/static/template.boilerplate.xml new file mode 100644 index 00000000..f2429c20 --- /dev/null +++ b/galaxy/build/static/template.boilerplate.xml @@ -0,0 +1,53 @@ + + DESCRIPTION + + nextflow + apptainer + openjdk + + + + + +INPUTS + + + + + + + + + @misc{nf-core/PIPELINE_NAME, + author = {}, + year = {}, + title = {nf-core/PIPELINE_NAME}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = 
{https://github.com/nf-core/PIPELINE_NAME}, + } + + + diff --git a/galaxy/build/static/template.xml b/galaxy/build/static/template.xml new file mode 100644 index 00000000..d10fd6fc --- /dev/null +++ b/galaxy/build/static/template.xml @@ -0,0 +1,171 @@ + + DESCRIPTION + + nextflow + micromamba + openjdk + + + + + +INPUTS + + + + + + + + +
+ + + + + + + +
+
+ +
+ + + + + + + + + + +
+ +
+ + + + + +
+ + + + + + + + + + +
+ +
+ + + + + + + +
+ + + + + + + + + + +
+
+ + + + @misc{nf-core/stableexpression, + author = {Coen, Olivier}, + year = {2025}, + title = {nf-core/stableexpression}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/OlivierCoen/stableexpression}, + } + + +
diff --git a/galaxy/dev/nextflow_apptainer.xml b/galaxy/dev/nextflow_apptainer.xml new file mode 100644 index 00000000..27f1f851 --- /dev/null +++ b/galaxy/dev/nextflow_apptainer.xml @@ -0,0 +1,34 @@ + + This pipeline is dedicated to finding the most stable genes across count datasets + + nextflow + apptainer + fuse-overlayfs + openjdk + + + results/species.txt + + && zip -r results.zip results + + ]]> + + + + + + + + diff --git a/galaxy/environment.yml b/galaxy/environment.yml new file mode 100644 index 00000000..feae659f --- /dev/null +++ b/galaxy/environment.yml @@ -0,0 +1,10 @@ +name: planemo +channels: + - defaults + - conda-forge + - bioconda + - nodefaults +dependencies: + - python=3.12 + - pip: + - planemo==0.75.33 diff --git a/galaxy/lint b/galaxy/lint new file mode 100755 index 00000000..dd141d2f --- /dev/null +++ b/galaxy/lint @@ -0,0 +1,8 @@ +#!/bin/bash + +galaxy_dir="$(dirname $(readlink -f "$0"))" +tool_file="${galaxy_dir}/tool_shed/tool/nf_core_stableexpression.xml" + +planemo lint $tool_file + +planemo shed_lint tool_shed/tool --tools diff --git a/galaxy/serve b/galaxy/serve new file mode 100755 index 00000000..019e9ee0 --- /dev/null +++ b/galaxy/serve @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +galaxy_dir="$(dirname $(readlink -f "$0"))" +tool_dir="${galaxy_dir}/tool_shed/tool" + +planemo serve \ + $tool_dir + +# add --no_cleanup to keep the pipelines workdirs after a run +# very useful for debugging diff --git a/galaxy/setup.md b/galaxy/setup.md new file mode 100644 index 00000000..d6e28afc --- /dev/null +++ b/galaxy/setup.md @@ -0,0 +1,60 @@ +# First time setup + +>[!NOTE] +>The following instructions need only to be performed once when you want to initialise your Galaxy tool and your repository on the Galaxy Toolshed. 
+ +## Setup build / testing environment + +Create a new environment with python and planemo installed: +``` +micromamba env create -f environment.yml -y +micromamba activate planemo +``` + +## Initialise Galaxy tool boilerplate + +The XML definition file is partially generated dynamically by: + +- parsing nextflow_schema.json +- fetching latest version of Nextflow, Singularity and OpenJDK in Conda channels + +However, you need to build a boilerplate file with things that cannot be directly interpreted from nextflow_schema.json, such as: + +- path to selected output files +- tests +- specific conditions for the inputs + +### Build template XML file + +``` +python build/build_template.py +``` + +The boilerplate XML file is generated at `galaxy/build/static/template.xml`. + +### Customise template XML file + +You must edit the boilerplate XML file to add your customisations: + +- Mandatory (at least if your pipeline uses a samplesheet): modify file paths in the samplesheet + Galaxy has its own path system, and you must retrieve dynamically the paths of the files provided, in order to modify them in the samplesheet + "Running the pipeline" + In this case, add "&&" before "nextflow drop ..." + +- modify outputs +- add tests + +## Create repository on Toolshed + +All necessary instructions are available in the [Galaxy Toolshed documentation](https://planemo.readthedocs.io/en/master/publishing.html).
+ +For now, you just need to : +- [configure a shed account](https://planemo.readthedocs.io/en/master/publishing.html#configuring-a-shed-account) +- [create a new repository on the Toolshed](https://planemo.readthedocs.io/en/master/publishing.html#creating-a-repository) + +Create a new folder for your tool and place the .shed.yml file in it: + +``` +mkdir -p tool_shed/tool +mv .shed.yml tool_shed +``` diff --git a/galaxy/test b/galaxy/test new file mode 100755 index 00000000..c11e3e34 --- /dev/null +++ b/galaxy/test @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +ARGS="$@" + +GALAXY_DIR="$(dirname $(readlink -f "$0"))" +TOOL_FOLDER="${GALAXY_DIR}/tool_shed/tool" +TOOL_FILE="${TOOL_FOLDER}/nf_core_stableexpression.xml" + + +TEST_OUTDIR="test_output" +mkdir -p $TEST_OUTDIR + +OUTPUT_REPORT="${TEST_OUTDIR}/report.html" +OUTPUT_JSON="${TEST_OUTDIR}/report.json" + +# add --update_test_data to create output file +planemo test \ + $TOOL_FILE \ + --install_galaxy \ + --job_output_files $TEST_OUTDIR \ + --test_output $OUTPUT_REPORT \ + --test_output_json $OUTPUT_JSON \ + --tool_data $TOOL_FOLDER \ + --update_test_data \ + $ARGS diff --git a/galaxy/tool_shed/.shed.yml b/galaxy/tool_shed/.shed.yml new file mode 100644 index 00000000..3c5b94c7 --- /dev/null +++ b/galaxy/tool_shed/.shed.yml @@ -0,0 +1,13 @@ +categories: + - Transcriptomics + - RNA + - Micro-array Analysis +description: Pipeline dedicated to finding the most stable genes across count datasets +homepage_url: https://nf-co.re/stableexpression/ +long_description: | + nf-core/stableexpression is a bioinformatics pipeline that aims at finding the most stable genes among a single or multiple public / local count datasets. + It takes as input a species name (mandatory), keywords for expression atlas search (optional) and / or a CSV input file listing local raw / normalised count datasets (optional). 
+ A typical usage is to find the most suitable qPCR housekeeping genes for a specific species (and optionally specific conditions). +name: nf_core_stableexpression +owner: ocoen +remote_repository_url: https://github.com/nf-core/stableexpression/ diff --git a/galaxy/tool_shed/tool/nf_core_stableexpression.xml b/galaxy/tool_shed/tool/nf_core_stableexpression.xml new file mode 100644 index 00000000..57d81dd3 --- /dev/null +++ b/galaxy/tool_shed/tool/nf_core_stableexpression.xml @@ -0,0 +1,648 @@ + + This pipeline is dedicated to identifying the most stable genes within a single or multiple expression dataset(s). This is particularly useful for identifying the most suitable RT-qPCR reference genes for a specific species. + + nextflow + micromamba + openjdk + + + + + +
+ + ^([a-zA-Z]+)[_ ]([a-zA-Z]+)[_ a-zA-Z]*$ + + + + + + + + + + + + + + + + (([a-zA-Z,]+))? + + + + + + + + + +
+
+ + + + ([A-Z0-9-]+,?)+ + + + + ([A-Z0-9-]+,?)+ + + +
+
+ + + + + + + + + + + + +
+
+ + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + ^\d+(\.\d+)?,\d+(\.\d+)?,\d+(\.\d+)?,\d+(\.\d+)?$ + +
+
+ + +
+
+ + + + + + + +
+ + + + + + + +
+
+ +
+ + + + + + + + + + +
+ +
+ + + + + +
+ + + + + + + + + + +
+ +
+ + + + + + + +
+ + + + + + + + + + +
+
+ + + + @misc{nf-core/stableexpression, + author = {Coen, Olivier}, + year = {2025}, + title = {nf-core/stableexpression}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/OlivierCoen/stableexpression}, + } + + +
diff --git a/galaxy/tool_shed/tool/rebuild_samplesheet.py b/galaxy/tool_shed/tool/rebuild_samplesheet.py new file mode 100644 index 00000000..43666e66 --- /dev/null +++ b/galaxy/tool_shed/tool/rebuild_samplesheet.py @@ -0,0 +1,70 @@ +#!/usr/env/bin python +""" +Script dedicated to renaming files in the samplesheet provided. +In Galaxy, data files provided by users are given a new file name. +However, original file names can be retrieved from the name attribute of the file object (inside the tool XML file). +In this script, we replace the original name with the actual Galaxy path. + +""" + +import argparse +import logging +from pathlib import Path +import csv + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--in", dest="samplesheet", type=Path, required=True) + parser.add_argument("--count-files", dest="count_files", type=str, required=True) + parser.add_argument( + "--count-filenames", dest="count_filenames", type=str, nargs="+", required=True + ) + parser.add_argument("--design-files", dest="design_files", type=str, required=True) + parser.add_argument( + "--design-filenames", + dest="design_filenames", + type=str, + nargs="+", + required=True, + ) + parser.add_argument("--out", dest="outfile", type=Path, required=True) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + # files and names arrive in the same order + count_files = args.count_files.split(",") + design_files = args.design_files.split(",") + + count_names_to_files = { + name: file for file, name in zip(count_files, args.count_filenames) + } + design_names_to_files = { + name: file for file, name in zip(design_files, args.design_filenames) + } + + renamed_rows = [] + with open(args.samplesheet, "r", newline="") as fin: + reader = csv.DictReader(fin) + header = reader.fieldnames + for row in reader: + # getting original names (file names as written in the 
samplesheet) + original_count_filename = Path(row["counts"]).name + original_design_filename = Path(row["design"]).name + # turning original names into new names (Galaxy file names) + row["counts"] = count_names_to_files[original_count_filename] + row["design"] = design_names_to_files[original_design_filename] + renamed_rows.append(row) + + with open(args.outfile, "w", newline="") as fout: + writer = csv.DictWriter(fout, fieldnames=header) + + writer.writeheader() + for row in renamed_rows: + writer.writerow(row) diff --git a/galaxy/tool_shed/tool/test_data/input.csv b/galaxy/tool_shed/tool/test_data/input.csv new file mode 100644 index 00000000..6ea4aa16 --- /dev/null +++ b/galaxy/tool_shed/tool/test_data/input.csv @@ -0,0 +1,3 @@ +counts,design,platform,normalised +tests/test_data/input_datasets/microarray.normalised.csv,tests/test_data/input_datasets/microarray.normalised.design.csv,microarray,true +tests/test_data/input_datasets/rnaseq.raw.csv,tests/test_data/input_datasets/rnaseq.raw.design.csv,rnaseq,false diff --git a/galaxy/tool_shed/tool/test_data/microarray.normalised.csv b/galaxy/tool_shed/tool/test_data/microarray.normalised.csv new file mode 100644 index 00000000..81f3f904 --- /dev/null +++ b/galaxy/tool_shed/tool/test_data/microarray.normalised.csv @@ -0,0 +1,10 @@ +gene_id,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586,GSM1528582,GSM1528578,GSM1528581,GSM1528577 +ENSRNA049453121,20925.1255070264,136184.261516502,144325.370645564,89427.0987612997,164143.182734208,34178.6378088171,28842.7323281157,76973.395782103,41906.9367255656,44756.5602263121,252562.049703724,6953.65643340122 +ENSRNA049453138,196173.051628372,16607.8367703051,344972.83715281,22602.4535330758,13678.598561184,104546.421532852,15451.4637472048,71664.8857281649,160643.257448002,91459.0578537683,88396.7173963033,281623.08555275 
+ENSRNA049454388,91547.4240932405,11625.4857392136,84483.143792525,80582.6604222701,218857.576978944,58304.7350856292,42234.0009090266,88475.1675656357,87306.1181782617,17513.436610296,90922.3378933406,76490.2207674135 +ENSRNA049454416,20925.1255070264,106290.155329953,193607.204524536,47170.3378081581,392119.825420608,190998.270108096,90648.5873169351,81397.1541603848,83813.8734511313,165404.67909724,111127.301869638,194702.380135234 +ENSRNA049454647,99394.3461583754,91343.1022366783,3520.13099135521,71738.2220832404,118547.854196928,20105.0810640101,81377.7090686122,15040.7784861581,66352.6498154789,110918.431865208,55563.6509348192,111258.50293442 +ENSRNA049454661,175247.926121346,66431.3470812206,24640.9169394865,52083.9146631746,360203.095444512,36189.1459152181,70046.6356539953,85820.9125386666,13968.9789085219,50594.3724297441,25256.2049703724,52152.4232505092 +ENSRNA049454747,117703.830977024,154452.881963838,281610.479308417,29481.4611300988,191500.379856576,152798.616086476,53565.0743236435,14156.0268105017,293348.557078959,155674.99209152,63140.5124259309,243377.975169043 +ENSRNA049454887,2615.6406883783,164417.584026021,28161.0479308417,82548.0911642767,50154.861391008,136714.551235268,97859.270398964,64586.872322914,328271.004350264,159566.866893808,151537.229822234,86920.7054175153 +ENSRNA049454931,177863.566809724,81378.4001744952,235848.776420799,88444.3833902964,18238.131414912,120630.48638406,82407.8066517592,50430.8455124123,118736.320722436,68107.8090400402,232357.085727426,163410.926184929 diff --git a/tests/input/custom_datasets/normalized.design.csv b/galaxy/tool_shed/tool/test_data/microarray.normalised.design.csv similarity index 100% rename from tests/input/custom_datasets/normalized.design.csv rename to galaxy/tool_shed/tool/test_data/microarray.normalised.design.csv diff --git a/galaxy/tool_shed/tool/test_data/rnaseq.raw.csv b/galaxy/tool_shed/tool/test_data/rnaseq.raw.csv new file mode 100644 index 00000000..a9a6bdb4 --- /dev/null +++ 
b/galaxy/tool_shed/tool/test_data/rnaseq.raw.csv @@ -0,0 +1,10 @@ +,ESM1528575,ESM1528576,ESM1528579,ESM1528583,ESM1528584,ESM1528585,ESM1528580,ESM1528586,ESM1528582,ESM1528578,ESM1528581,ESM1528577 +ENSRNA049453121,1,82,8,82,4,68,88,73,46,57,25,22 +ENSRNA049453138,68,93,41,84,36,18,28,92,84,85,92,32 +ENSRNA049454388,38,10,0,23,11,17,95,57,25,82,10,70 +ENSRNA049454416,75,55,7,30,79,60,15,97,12,35,60,56 +ENSRNA049454647,35,64,55,91,48,95,68,100,24,26,100,47 +ENSRNA049454661,8,99,80,48,86,29,80,17,19,9,44,2 +ENSRNA049454747,67,7,98,53,3,10,52,87,4,80,22,15 +ENSRNA049454887,8,40,24,90,42,52,79,81,94,23,35,81 +ENSRNA049454931,45,49,67,73,26,76,41,16,34,47,36,25 diff --git a/tests/input/custom_datasets/raw.design.csv b/galaxy/tool_shed/tool/test_data/rnaseq.raw.design.csv similarity index 100% rename from tests/input/custom_datasets/raw.design.csv rename to galaxy/tool_shed/tool/test_data/rnaseq.raw.design.csv diff --git a/main.nf b/main.nf index 43fa183a..be4d42ea 100644 --- a/main.nf +++ b/main.nf @@ -9,15 +9,13 @@ ---------------------------------------------------------------------------------------- */ -nextflow.enable.dsl = 2 - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT FUNCTIONS / MODULES / SUBWORKFLOWS / WORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { STABLEEXPRESSION } from './workflows/stableexpression' +include { STABLEEXPRESSION } from './workflows/stableexpression' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_stableexpression_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_stableexpression_pipeline' @@ -31,7 +29,19 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_stab // WORKFLOW: Run main analysis pipeline depending on type of input // workflow NFCORE_STABLEEXPRESSION { - STABLEEXPRESSION() + + take: + input_datasets + + main: + + // + // 
WORKFLOW: Run pipeline + // + STABLEEXPRESSION( input_datasets ) + + emit: + multiqc_report = STABLEEXPRESSION.out.multiqc_report // channel: /path/to/multiqc_report.html } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -42,24 +52,27 @@ workflow NFCORE_STABLEEXPRESSION { workflow { main: - // // SUBWORKFLOW: Run initialisation tasks // PIPELINE_INITIALISATION ( params.version, - params.help, params.validate_params, params.monochrome_logs, args, - params.outdir + params.outdir, + params.datasets, + params.help, + params.help_full, + params.show_hidden ) // // WORKFLOW: Run main workflow // - NFCORE_STABLEEXPRESSION () - + NFCORE_STABLEEXPRESSION ( + PIPELINE_INITIALISATION.out.input_datasets + ) // // SUBWORKFLOW: Run completion tasks // @@ -69,7 +82,8 @@ workflow { params.plaintext_email, params.outdir, params.monochrome_logs, - params.hook_url + params.hook_url, + NFCORE_STABLEEXPRESSION.out.multiqc_report ) } diff --git a/modules.json b/modules.json index 7b7a08ad..ca279222 100644 --- a/modules.json +++ b/modules.json @@ -3,21 +3,30 @@ "homePage": "https://github.com/nf-core/stableexpression", "repos": { "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "multiqc": { + "branch": "master", + "git_sha": "79b36b51048048374b642289bfe9e591ef56fe05", + "installed_by": ["modules"] + } + } + }, "subworkflows": { "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", + "git_sha": "df4d1c8cdee98a1bbbed8fc51e82296568e0f9c1", "installed_by": ["subworkflows"] }, - "utils_nfvalidation_plugin": { + "utils_nfschema_plugin": { "branch": "master", - "git_sha": "e6c055a7bb7551ef18436a8ef410dd0e27393c61", + "git_sha": 
"e753770db613ce014b3c4bc94f6cba443427b726", "installed_by": ["subworkflows"] } } diff --git a/modules/local/aggregate_results/environment.yml b/modules/local/aggregate_results/environment.yml new file mode 100644 index 00000000..93ba05a4 --- /dev/null +++ b/modules/local/aggregate_results/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 + - conda-forge::pyyaml==6.0.3 diff --git a/modules/local/aggregate_results/main.nf b/modules/local/aggregate_results/main.nf new file mode 100644 index 00000000..56173a85 --- /dev/null +++ b/modules/local/aggregate_results/main.nf @@ -0,0 +1,43 @@ +process AGGREGATE_RESULTS { + debug true + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7e/7e08ea26f496697870f6afe87a9def87c1038c000306c9280719d40ee9797293/data': + 'community.wave.seqera.io/library/polars_python_pyyaml:0d7b8bed8db11ef1' }" + + input: + path count_file + path stat_score_files + path platform_stat_files, stageAs: "?/*" + val target_genes + path metadata_files + path mapping_files + path multiqc_config + + output: + path 'all_genes_summary.csv', emit: all_genes_summary + path '*most_stable_genes_summary.csv', emit: most_stable_genes_summary + path '*most_stable_genes_transposed_counts.csv', emit: most_stable_genes_transposed_counts_filtered + path 'custom_content_multiqc_config.yaml', emit: custom_content_multiqc_config + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('pyyaml'), eval('python3 -c "import yaml; print(yaml.__version__)"'), topic: versions + + script: + def mapping_files_arg = mapping_files ? "--mappings " + "$mapping_files" : "" + def metadata_files_arg = metadata_files ? "--metadata " + "$metadata_files" : "" + def target_genes_arg = target_genes ? 
"--target-genes " + "${target_genes.join(' ')}" : "" + """ + aggregate_results.py \\ + --counts $count_file \\ + --stats-with-scores $stat_score_files \\ + --platform-stats $platform_stat_files \\ + --multiqc-config $multiqc_config \\ + $mapping_files_arg \\ + $metadata_files_arg \\ + $target_genes_arg + """ + +} diff --git a/modules/local/clean_gene_ids/environment.yml b/modules/local/clean_gene_ids/environment.yml new file mode 100644 index 00000000..df720d08 --- /dev/null +++ b/modules/local/clean_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.37.1 diff --git a/modules/local/clean_gene_ids/main.nf b/modules/local/clean_gene_ids/main.nf new file mode 100644 index 00000000..6165ab07 --- /dev/null +++ b/modules/local/clean_gene_ids/main.nf @@ -0,0 +1,27 @@ +process CLEAN_GENE_IDS { + + label 'process_low' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/87/878943dcc1b8e30cd535a41886e0f75fcd8bbe667b2d2b0bc4adb0c549539e64/data': + 'community.wave.seqera.io/library/polars_python:07cce0ec1b0aeb84' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path('*.cleaned.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: id_cleaning_failure_reason + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + clean_gene_ids.py \\ + --count-file "$count_file" + """ + +} diff --git a/modules/local/collect_all_gene_ids/environment.yml b/modules/local/collect_all_gene_ids/environment.yml new file mode 100644 index 00000000..75afc696 --- /dev/null +++ b/modules/local/collect_all_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.2 + - conda-forge::tqdm==4.67.1 diff --git a/modules/local/collect_all_gene_ids/main.nf b/modules/local/collect_all_gene_ids/main.nf new file mode 100644 index 00000000..6f72b3c1 --- /dev/null +++ b/modules/local/collect_all_gene_ids/main.nf @@ -0,0 +1,25 @@ +process COLLECT_ALL_GENE_IDS { + + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/70/70c17cde84896904c0620d614cba74ff029f1255db64e66416e63c91b7c959a2/data': + 'community.wave.seqera.io/library/python_tqdm:4e039400f75bdad0' }" + + input: + path count_files, stageAs: "?/*" + + output: + path 'unique_gene_ids.txt', emit: unique_gene_ids + path 'gene_id_occurrences.csv', emit: gene_id_occurrences + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + """ + collect_gene_ids.py \\ + --ids "$count_files" + """ + +} diff --git a/modules/local/collect_statistics/environment.yml b/modules/local/collect_statistics/environment.yml new file mode 100644 index 00000000..5d27c0af --- /dev/null +++ b/modules/local/collect_statistics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.7 + - conda-forge::pandas==2.3.3 diff --git a/modules/local/collect_statistics/main.nf b/modules/local/collect_statistics/main.nf new file mode 100644 index 00000000..83e3d5c7 --- /dev/null +++ b/modules/local/collect_statistics/main.nf @@ -0,0 +1,25 @@ +process COLLECT_STATISTICS { + + tag "${file.baseName}" + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
// Runs compute_dataset_statistics.py on one count file and publishes the
// resulting skewness.txt, keyed by dataset name, on the `skewness` topic.
process COMPUTE_DATASET_STATISTICS {

    label 'process_single'

    tag "${meta.dataset}"

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/87/878943dcc1b8e30cd535a41886e0f75fcd8bbe667b2d2b0bc4adb0c549539e64/data':
        'community.wave.seqera.io/library/polars_python:07cce0ec1b0aeb84' }"

    input:
    tuple val(meta), path(count_file)

    output:
    tuple val(meta.dataset), path("skewness.txt"),                                                                    topic: skewness
    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"),                       topic: versions
    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'),       topic: versions

    script:
    // the count file is quoted so that staged names containing spaces survive word splitting
    """
    compute_dataset_statistics.py \\
        --counts "$count_file"
    """

}
// Runs compute_gene_transcript_lengths.py on a GFF3 annotation and emits
// gene_transcript_lengths.csv. A `.gz` annotation is decompressed first.
process COMPUTE_GENE_TRANSCRIPT_LENGTHS {

    label 'process_single'

    tag "${gff3.baseName}"

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3d/3d7126100b0eb7cb53dfb50291707ea8dda3b9738b76551ab73605d0acbe114b/data':
        'community.wave.seqera.io/library/pandas:2.3.3--5a902bf824a79745' }"

    input:
    path gff3

    output:
    path('gene_transcript_lengths.csv'),                                                                              emit: csv
    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"),                       topic: versions
    tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'),       topic: versions

    script:
    // the comparison already yields a boolean; it is rendered as "true"/"false" in the shell test below
    def is_compressed = gff3.getExtension() == "gz"
    // for a compressed input, the decompressed file takes the name without the .gz suffix
    def gff3_name = is_compressed ? gff3.getBaseName() : gff3
    """
    if [ "${is_compressed}" == "true" ]; then
        gzip -c -d "${gff3}" > "${gff3_name}"
    fi

    compute_gene_transcript_lengths.py \\
        --annotation "${gff3_name}"
    """


    stub:
    """
    touch gene_transcript_lengths.csv
    """

}
workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(normfinder_stability_file), path(genorm_stability_file), path(section_stat_file) + val stability_score_weights + + output: + path "${meta.section}.stats_with_scores.csv", emit: stats_with_stability_scores + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def genorm_stability_file_arg = genorm_stability_file ? "--genorm-stability $genorm_stability_file" : "" + """ + compute_stability_scores.py \\ + --stats $section_stat_file \\ + --weights "$stability_score_weights" \\ + --normfinder-stability $normfinder_stability_file \\ + $genorm_stability_file_arg + + mv stats_with_scores.csv ${meta.section}.stats_with_scores.csv + """ + +} diff --git a/modules/local/dash_app/app/app.py b/modules/local/dash_app/app/app.py new file mode 100755 index 00000000..27fc7bf6 --- /dev/null +++ b/modules/local/dash_app/app/app.py @@ -0,0 +1,91 @@ +import socket +import dash_mantine_components as dmc + +from dash_extensions.enrich import ( + DashProxy, + html, + ServersideOutputTransform, + TriggerTransform, +) +from dash_extensions.logging import NotificationsLogHandler + +from src.utils import config, style +from src.components import stores, tooltips +from src.components import top, right_sidebar +from src.callbacks import common, genes, samples + +DEBUG = True +# DEBUG = False + +# -------------------- SETUP LOGGING -------------------- + +log_handler = NotificationsLogHandler() +logger = log_handler.setup_logger(__name__) + +# 
def find_port(port: int) -> int:
    """Find a port not in use, starting at the given port.

    Ports are probed one by one by attempting a TCP connection to localhost:
    a successful connection means something is already listening there.

    Args:
        port: first port to try.

    Returns:
        The first port >= ``port`` on which nothing accepted the connection.

    Raises:
        RuntimeError: if no free port exists between ``port`` and 65535.
    """
    highest_port = 65535
    # iterate instead of recursing: each probe socket is closed before the
    # next one is opened, and a long run of busy ports cannot exhaust the stack
    for candidate in range(port, highest_port + 1):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(("localhost", candidate)) != 0:
                return candidate
    raise RuntimeError(f"No free port found between {port} and {highest_port}")
a/modules/local/dash_app/app/environment.yml b/modules/local/dash_app/app/environment.yml new file mode 100644 index 00000000..ba925002 --- /dev/null +++ b/modules/local/dash_app/app/environment.yml @@ -0,0 +1,16 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==2.3.3 + - conda-forge::polars==1.39.2 + - conda-forge::pyarrow==23.0.1 + - conda-forge::scipy==1.17.1 + - conda-forge::dash==3.3.0 + - conda-forge::dash-mantine-components==2.4.0 + - conda-forge::dash-extensions==2.0.4 + - conda-forge::dash-iconify==0.1.2 + - conda-forge::dash-ag-grid==32.3.2 diff --git a/modules/local/dash_app/app/src/callbacks/common.py b/modules/local/dash_app/app/src/callbacks/common.py new file mode 100644 index 00000000..bd349a21 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/common.py @@ -0,0 +1,36 @@ +from dash_extensions.enrich import Input, Trigger, Output, State, callback + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def register_callbacks(): + @callback( + Output("drawer", "opened"), + Trigger("settings-button", "n_clicks"), + prevent_initial_call=True, + ) + def open_drawer(): + return True + + @callback( + Output("sidebar-genes-items", "style"), + Output("sidebar-samples-items", "style"), + Input("tabs", "value"), + State("sidebar-genes-items", "style"), + State("sidebar-samples-items", "style"), + ) + def manage_drawer_content( + tabs_value: str, gene_stack_style: dict, sample_stack_style: dict + ): + if tabs_value == "genes": + gene_stack_style["display"] = "block" + sample_stack_style["display"] = "none" + else: # tabs_value == 'samples': + gene_stack_style["display"] = "none" + sample_stack_style["display"] 
= "block" + return gene_stack_style, sample_stack_style diff --git a/modules/local/dash_app/app/src/callbacks/genes.py b/modules/local/dash_app/app/src/callbacks/genes.py new file mode 100644 index 00000000..10cd2f74 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/genes.py @@ -0,0 +1,109 @@ +import plotly.graph_objects as go +from dash_extensions.enrich import Input, Output, Serverside, State, callback, ctx +from src.utils.data_management import DataManager + +data_manager = DataManager() + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def get_selected_rows(selected_genes: list[str]) -> list[dict]: + return data_manager.all_genes_stat_df.filter( + data_manager.all_genes_stat_df["gene_id"].is_in(selected_genes) + ).to_dicts() + + +def register_callbacks(): + @callback( + Output("gene-counts", "data"), + Output("gene-dropdown", "value"), + Output("gene-stats-table", "selectedRows"), + Input("gene-dropdown", "value"), + Input("gene-stats-table", "selectedRows"), + State("gene-counts", "data"), + # prevent_initial_call=True, + ) + def update_gene_stored_data( + selected_genes: list[str], table_selected_rows: list[dict], stored_data: dict + ) -> dict: + if ctx.triggered_id == "gene-stats-table": + # updating selected genes + if table_selected_rows is not None: + selected_genes = [row["gene_id"] for row in table_selected_rows] + else: + selected_genes = [] + else: + # ctx.triggered_id is None (callback triggered at app launch / refresh) + # or ctx.triggered_id == "gene-dropdown": + # taking the dropdown values as reference (since there is persistence on it) + table_selected_rows = get_selected_rows(selected_genes) + + # deleting stored data for genes not anymore in the selected list + for stored_gene in list( + stored_data.keys() + ): # we need to copy the list of keys before changing the dict + 
if stored_gene not in selected_genes: + del stored_data[stored_gene] + + # storing data for new genes in the selected list + for gene in selected_genes: + if gene not in stored_data: + gene_data = data_manager.get_gene_counts(gene) + stored_data[gene] = { + "counts": gene_data.to_list(), + "samples": gene_data.index.to_list(), + } + + return Serverside(stored_data), selected_genes, table_selected_rows + + @callback( + Output("gene-graph", "figure"), + Output("gene-graph", "style"), + Input("gene-counts", "data"), + Input("gene-graph-jitter", "value"), + Input("gene-graph-pointpos", "value"), + Input("gene-graph-boxmean", "value"), + Input("gene-graph-display-points", "value"), + State("gene-graph", "style"), + # prevent_initial_call=True, + ) + def update_gene_graph( + gene_stored_data: dict, + jitter: float, + pointpos: float, + boxmean: str | bool, + point_display_mode: str, + graph_style: dict, + ): + if not gene_stored_data: + graph_style["display"] = "none" + return {}, graph_style + + graph_style["display"] = "block" + + fig = go.Figure() + + # we need to use the reversed order, otherwise the last traced added is at the top of the graph + for gene, gene_data in reversed(gene_stored_data.items()): + fig.add_trace( + go.Box( + name=gene, + x=gene_data["counts"], + boxmean=boxmean, + jitter=jitter, + pointpos=pointpos, + boxpoints=point_display_mode, + customdata=gene_data["samples"], + hovertemplate="Sample: %{customdata}
Normalised count: %{x}
", + showlegend=False, + ) + ) + + fig.update_layout(xaxis=dict(range=[0, 1]), yaxis=dict(ticklabelstandoff=10)) + + return fig, graph_style diff --git a/modules/local/dash_app/app/src/callbacks/samples.py b/modules/local/dash_app/app/src/callbacks/samples.py new file mode 100644 index 00000000..1ac296c8 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/samples.py @@ -0,0 +1,126 @@ +import plotly.graph_objects as go +import numpy as np +from scipy.stats import gaussian_kde +from dash_extensions.enrich import Input, Output, State, callback, Serverside + +from src.utils.data_management import DataManager + +data_manager = DataManager() + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def register_callbacks(): + @callback( + Output("sample-counts", "data"), + Input("sample-dropdown", "value"), + State("sample-counts", "data"), + # prevent_initial_call=True, + ) + def update_stored_data( + sample_dropdown_values: list[str], stored_sample_counts: dict + ): + updated_stored_sample_counts = dict(stored_sample_counts) # deep copy + + # deleting stored data for samples not anymore in the selected list + for stored_sample in ( + stored_sample_counts + ): # we need to copy the list of keys before changing the dict + if stored_sample not in sample_dropdown_values: + del updated_stored_sample_counts[stored_sample] + + # storing data for new samples in the selected list + for sample in sample_dropdown_values: + if sample not in updated_stored_sample_counts: + sample_data = data_manager.get_sample_counts(sample) + updated_stored_sample_counts[sample] = { + "counts": sample_data.to_list(), + "genes": sample_data.index.to_list(), + } + + return Serverside(updated_stored_sample_counts) + + @callback( + Output("sample-graph", "figure"), + Output("sample-graph", "style"), + 
Output("sample_stats_display_accordion_control", "disabled"), + Output("sample_points_display_accordion_control", "disabled"), + Output("sample_plot_customisation_accordion_control", "disabled"), + Input("sample-counts", "data"), + Input("curve-type", "value"), + Input("sample-graph-jitter", "value"), + Input("sample-graph-pointpos", "value"), + Input("sample-graph-boxmean", "value"), + Input("sample-graph-display-points", "value"), + State("sample-graph", "style"), + # prevent_initial_call=True, + ) + def update_sample_histogram( + sample_counts: dict, + curve_type: str, + jitter: float, + pointpos: float, + boxmean: str | bool, + point_display_mode: str, + graph_style: dict, + ): + if not sample_counts: + graph_style["display"] = "none" + return {}, graph_style, True, True, True + + graph_style["display"] = "block" + + fig = go.Figure() + + sample_stats_display_ac_disabled = True + sample_points_display_ac_disabled = True + sample_plot_customisation_ac_disabled = True + + # we need to use the reversed order, otherwise the last traced added is at the top of the graph + for sample, sample_data in reversed(sample_counts.items()): + counts = sample_data["counts"] + + if curve_type == "histogram": + fig.add_trace(go.Histogram(name=sample, x=counts)) + + elif curve_type == "kde": + kde_function = gaussian_kde(counts) + xvals = np.linspace(min(counts), max(counts), 1000) + yvals = kde_function(xvals) + fig.add_trace(go.Scatter(name=sample, x=xvals, y=yvals)) + + else: # boxplot + # we need to use the reversed order, otherwise the last traced added is at the top of the graph + fig.add_trace( + go.Box( + name=sample, + x=counts, + jitter=jitter, + pointpos=pointpos, + boxpoints=point_display_mode, + boxmean=boxmean, + customdata=sample_data["genes"], + hovertemplate="Gene: %{customdata}
Count: %{x}
", + ) + ) + # update the layout to remove y-axis labels + fig.update_layout(yaxis=dict(showticklabels=False)) + + sample_stats_display_ac_disabled = False + sample_points_display_ac_disabled = False + sample_plot_customisation_ac_disabled = False + + fig.update_layout(xaxis=dict(range=[0, 1]), yaxis=dict(ticklabelstandoff=10)) + + return ( + fig, + graph_style, + sample_stats_display_ac_disabled, + sample_points_display_ac_disabled, + sample_plot_customisation_ac_disabled, + ) diff --git a/modules/local/dash_app/app/src/components/graphs.py b/modules/local/dash_app/app/src/components/graphs.py new file mode 100644 index 00000000..94c7f66c --- /dev/null +++ b/modules/local/dash_app/app/src/components/graphs.py @@ -0,0 +1,11 @@ +from dash_extensions.enrich import dcc +from src.utils import style + + +def get_graph(graph_id: str): + return dcc.Graph(id=graph_id, figure={}, style=style.GRAPH) + + +gene_graph = get_graph("gene-graph") + +sample_graph = get_graph("sample-graph") diff --git a/modules/local/dash_app/app/src/components/icons.py b/modules/local/dash_app/app/src/components/icons.py new file mode 100755 index 00000000..6801d88b --- /dev/null +++ b/modules/local/dash_app/app/src/components/icons.py @@ -0,0 +1,7 @@ +from dash_iconify import DashIconify + +# all dash-iconify icons can be found at +# https://icon-sets.iconify.design/ +# --------------- SIDEBAR --------------------- +magnifying_glass_icon = DashIconify(icon="radix-icons:magnifying-glass") +data_loaded_icon = DashIconify(icon="akar-icons:circle-check", color="white", width=30) diff --git a/modules/local/dash_app/app/src/components/right_sidebar.py b/modules/local/dash_app/app/src/components/right_sidebar.py new file mode 100644 index 00000000..98aaf072 --- /dev/null +++ b/modules/local/dash_app/app/src/components/right_sidebar.py @@ -0,0 +1,19 @@ +import dash_mantine_components as dmc +from src.components.settings import genes, samples +from src.utils import style + +drawer = dmc.Drawer( + 
children=[ + genes.sidebar_stack, + samples.sidebar_stack, + ], + id="drawer", + opened=True, + position="right", + withCloseButton=True, + closeOnEscape=True, + overlayProps=dict(backgroundOpacity=0), + trapFocus=False, + zIndex=10000, + style=style.SIDEBAR, +) diff --git a/modules/local/dash_app/app/src/components/settings/genes.py b/modules/local/dash_app/app/src/components/settings/genes.py new file mode 100644 index 00000000..764b41c9 --- /dev/null +++ b/modules/local/dash_app/app/src/components/settings/genes.py @@ -0,0 +1,140 @@ +import dash_mantine_components as dmc +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = DataManager() + +sorted_genes = data_manager.get_sorted_genes() + +nb_sections = data_manager.get_nb_sections() + +gene_selection_stack = dmc.Stack( + [ + dmc.MultiSelect( + id="gene-dropdown", + label=dmc.Text("Genes to display", fw=600, style={"paddingBottom": "5px"}), + placeholder="Select genes of interest", + nothingFoundMessage="No gene found", + data=sorted_genes, + value=sorted_genes[:nb_sections], + w=400, + clearable=True, + searchable=True, + limit=100, + maxValues=20, + size="sm", + checkIconPosition="right", + hidePickedOptions=True, + disabled=False, + persistence=True, + persisted_props=["value"], + persistence_type="session", + # style=style.DROPDOWN, + comboboxProps={ + "shadow": "md", + "transitionProps": {"transition": "pop", "duration": 200}, + }, + ) + ], + align="stretch", + gap="xl", +) + +gene_graph_stats_display_stack = dmc.Stack( + [ + dmc.Text( + "Display mean / standard deviation", style=style.STACK_SUBSECTION_TITLE + ), + dmc.SegmentedControl( + id="gene-graph-boxmean", + value="sd", + color="teal", + data=[ + {"value": False, "label": "None"}, + {"value": True, "label": "Mean only"}, + {"value": "sd", "label": "Mean + Std"}, + ], + mb=10, + ), + ], + align="center", + gap="xl", +) + +gene_graph_points_display_stack = dmc.Stack( + [ + dmc.Text("Display points", 
style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="gene-graph-display-points", + value="outliers", + color="teal", + data=[ + {"value": "outliers", "label": "Outliers"}, + {"value": "suspectedoutliers", "label": "Suspected Outliers"}, + {"value": "all", "label": "All points"}, + ], + mb=10, + ), + dmc.Text( + "Position of points relatively to boxes", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="gene-graph-pointpos", + value=-1.8, + color="teal", + min=-2, + max=2, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + dmc.Text( + "Spreading of displayed points (jitter)", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="gene-graph-jitter", + value=0.3, + color="teal", + min=0, + max=1, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + ], + align="center", + gap="xl", +) + +sidebar_stack = dmc.Accordion( + value="gene_selection", + children=[ + dmc.AccordionItem( + [ + dmc.AccordionControl("Gene selection"), + dmc.AccordionPanel(gene_selection_stack), + ], + value="gene_selection", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl("Statistics display"), + dmc.AccordionPanel(gene_graph_stats_display_stack), + ], + value="gene_stats_display", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl("Points display"), + dmc.AccordionPanel(gene_graph_points_display_stack), + ], + value="gene_points_display", + ), + ], + id="sidebar-genes-items", + style={"marginTop": "20px", "display": "none"}, +) diff --git a/modules/local/dash_app/app/src/components/settings/samples.py b/modules/local/dash_app/app/src/components/settings/samples.py new file mode 100644 index 00000000..04126f1c --- /dev/null +++ b/modules/local/dash_app/app/src/components/settings/samples.py @@ -0,0 +1,176 @@ +import dash_mantine_components as dmc +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = 
DataManager() + +sorted_samples = data_manager.get_sorted_samples() + +NB_SAMPLES_DEFAULT = 10 + +sample_selection_stack = dmc.Stack( + [ + dmc.MultiSelect( + id="sample-dropdown", + label="Select list of samples to visualise", + placeholder="Select samples", + nothingFoundMessage="No samples found", + data=sorted_samples, + value=sorted_samples[:NB_SAMPLES_DEFAULT], + w=400, + clearable=True, + searchable=True, + limit=100, + maxValues=20, + size="sm", + checkIconPosition="right", + hidePickedOptions=True, + disabled=False, + persistence=True, + persisted_props=["value"], + persistence_type="session", + # style=style.DROPDOWN, + comboboxProps={ + "shadow": "md", + "transitionProps": {"transition": "pop", "duration": 200}, + }, + ) + ], + align="left", + gap="xl", +) + + +sample_graph_plot_type_stack = dmc.Stack( + [ + dmc.Text("Type of plot", style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="curve-type", + value="ng", + color="teal", + data=[ + {"value": "histogram", "label": "Histogram"}, + {"value": "kde", "label": "Kde"}, + {"value": "boxplot", "label": "Box-plot"}, + ], + mb=10, + ), + ], + align="left", + gap="xl", +) + + +sample_graph_stats_display_stack = dmc.Stack( + [ + dmc.Text( + "Display mean / standard deviation", style=style.STACK_SUBSECTION_TITLE + ), + dmc.SegmentedControl( + id="sample-graph-boxmean", + value="sd", + color="teal", + data=[ + {"value": False, "label": "None"}, + {"value": True, "label": "Mean only"}, + {"value": "sd", "label": "Mean + Std"}, + ], + mb=10, + ), + ], + align="left", + gap="xl", +) + +sample_graph_points_display_stack = dmc.Stack( + [ + dmc.Text("Display points", style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="sample-graph-display-points", + value="outliers", + color="teal", + data=[ + {"value": "outliers", "label": "Outliers"}, + {"value": "suspectedoutliers", "label": "Suspected Outliers"}, + {"value": "all", "label": "All points"}, + ], + mb=10, + ), + dmc.Text( + "Position of 
points relatively to boxes", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="sample-graph-pointpos", + value=-1.8, + color="teal", + min=-2, + max=2, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + dmc.Text( + "Spreading of displayed points (jitter)", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="sample-graph-jitter", + value=0.3, + color="teal", + min=0, + max=1, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + ], + align="left", + gap="xl", +) + + +sidebar_stack = dmc.Accordion( + value="sample_selection", + children=[ + dmc.AccordionItem( + [ + dmc.AccordionControl("Sample selection"), + dmc.AccordionPanel(sample_selection_stack), + ], + value="sample_selection", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Plot customisation", + id="sample_plot_customisation_accordion_control", + ), + dmc.AccordionPanel(sample_graph_plot_type_stack), + ], + value="sample_plot_customisation", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Statistics display", id="sample_stats_display_accordion_control" + ), + dmc.AccordionPanel(sample_graph_stats_display_stack), + ], + value="sample_stats_display", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Points display", id="sample_points_display_accordion_control" + ), + dmc.AccordionPanel(sample_graph_points_display_stack), + ], + value="sample_points_display", + ), + ], + id="sidebar-samples-items", + style={"marginTop": "20px", "display": "none"}, +) diff --git a/modules/local/dash_app/app/src/components/stores.py b/modules/local/dash_app/app/src/components/stores.py new file mode 100755 index 00000000..ee748532 --- /dev/null +++ b/modules/local/dash_app/app/src/components/stores.py @@ -0,0 +1,9 @@ +from dash_extensions.enrich import dcc + +selected_samples = dcc.Store("selected-sample", storage_type="session") +gene_counts = dcc.Store(id="gene-counts", 
storage_type="session", data={}) +sample_counts = dcc.Store(id="sample-counts", storage_type="session", data={}) +stores_to_load = [ + gene_counts, + sample_counts, +] diff --git a/modules/local/dash_app/app/src/components/tables.py b/modules/local/dash_app/app/src/components/tables.py new file mode 100644 index 00000000..3c832931 --- /dev/null +++ b/modules/local/dash_app/app/src/components/tables.py @@ -0,0 +1,46 @@ +import dash_ag_grid as dag +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = DataManager() + +NB_GENES_SELECTED_DEFAULT = 10 + +row_data = data_manager.get_table_raw_data() +# default_selected_rows = data_manager.all_genes_stat_df.head(NB_GENES_SELECTED_DEFAULT).to_dicts() + +column_defs = [ + {"field": col, "headerName": col.replace("_", " ").capitalize()} + for col in data_manager.all_genes_stat_df.columns +] + + +all_genes_stats_table = dag.AgGrid( + rowData=row_data, + columnDefs=column_defs, + className="ag-theme-alpine", + # columnSizeOptions=dict(skipHeader=False), + # columnSize="autoSizetoFit", + defaultColDef=dict( + filter=True, + resizable=True, + editable=False, + sortable=True, + ), + dashGridOptions=dict( + pagination=True, + paginationAutoPageSize=True, + enableCellTextSelection=True, + ensureDomOrder=True, + animateRows=False, + rowSelection=dict(mode="multiRow"), + headerCheckboxSelection=False, + getRowId="params.data.gene_id", + ), + # selectedRows=default_selected_rows, + style=style.AG_GRID, + # persistence=True, + # persistence_type="session", + # persisted_props=["selectedRows"], + id="gene-stats-table", +) diff --git a/modules/local/dash_app/app/src/components/tooltips.py b/modules/local/dash_app/app/src/components/tooltips.py new file mode 100644 index 00000000..cf0e9421 --- /dev/null +++ b/modules/local/dash_app/app/src/components/tooltips.py @@ -0,0 +1,43 @@ +import dash_mantine_components as dmc + + +def get_tooltip( + classname: str, label: str, position: str = "bottom", 
multiline: bool = True +): + return dmc.Tooltip( + target=f".{classname}", + label=label, + multiline=multiline, + position=position, + color="grey", + withArrow=True, + arrowSize=8, + zIndex=20000, + radius=4, + transitionProps={ + "transition": "fade", + "duration": 200, + "timingFunction": "ease", + }, + ) + + +genes_tabitem_tooltip = get_tooltip( + classname="genes-tabitem", label="Distribution of normalised counts gene per gene" +) + +samples_tabitem_tooltip = get_tooltip( + classname="samples-tabitem", + label="Distribution of normalised counts sample per sample", +) + +settings_button_tooltip = get_tooltip( + classname="settings-button", + label="Open settings to select genes / samples and to customise display", +) + +tooltips_to_load = [ + genes_tabitem_tooltip, + samples_tabitem_tooltip, + settings_button_tooltip, +] diff --git a/modules/local/dash_app/app/src/components/top.py b/modules/local/dash_app/app/src/components/top.py new file mode 100755 index 00000000..2ac66959 --- /dev/null +++ b/modules/local/dash_app/app/src/components/top.py @@ -0,0 +1,91 @@ +import dash_mantine_components as dmc +from dash_iconify import DashIconify +from src.components import graphs, tables +from src.utils import style + +gene_icon = DashIconify(icon="material-symbols:genetics", width=20) + +sample_icon = DashIconify(icon="ic:baseline-dashboard-customize", width=20) + + +tabs = dmc.Tabs( + children=[ + dmc.TabsList( + children=[ + dmc.TabsTab( + dmc.Text("Counts / gene", fw=500), + className="genes-tabitem", + color="teal", + leftSection=gene_icon, + value="genes", + style=style.HEADER_TABLIST_ITEM, + ), + dmc.TabsTab( + dmc.Text("Counts / sample", fw=500), + className="samples-tabitem", + leftSection=sample_icon, + value="samples", + color="red", + style=style.HEADER_TABLIST_ITEM, + ), + dmc.TabsTab( + dmc.Text("Statistics - all genes", fw=500), + leftSection=sample_icon, + value="gene_stats", + color="orange", + style=style.HEADER_TABLIST_ITEM, + ), + ], + 
style=style.HEADER_TABLIST, + ), + dmc.TabsPanel( + children=[ + dmc.Text("dhkhg"), + graphs.gene_graph, + ], + style=style.TABS_PANEL, + value="genes", + ), + dmc.TabsPanel( + children=[ + graphs.sample_graph, + ], + style=style.TABS_PANEL, + value="samples", + ), + dmc.TabsPanel( + children=[tables.all_genes_stats_table], + style=style.TABS_PANEL, + value="gene_stats", + ), + ], + id="tabs", + variant="default", + radius="md", + orientation="horizontal", + placement="right", + value="genes", + persistence=True, + persisted_props=["value"], + persistence_type="session", + style=style.TAB, +) + +settings_button = dmc.Button( + "Select data / options", + id="settings-button", + className="settings-button", + color="teal", + style=style.SETTINGS_BUTTON, +) + +header = dmc.Grid( + children=[ + dmc.GridCol(tabs, span=10), + dmc.GridCol( + settings_button, span=2, style={"textAlign": "right", "marginTop": "20px"} + ), + ], + style={"marginRight": "20px"}, + # gutter="xl", +) diff --git a/modules/local/dash_app/app/src/utils/config.py b/modules/local/dash_app/app/src/utils/config.py new file mode 100644 index 00000000..070a8b04 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/config.py @@ -0,0 +1,31 @@ +PLOTLY_APP_PORT = 8080 +HOST = "0.0.0.0" + +LOGO_FILENAME = "assets/nf-core-stableexpression_logo_light_small.png" + +LOGGING_FORMAT = "[%(asctime)s] [%(name)s] %(levelname)s - %(message)s" +DATE_FORMAT = "%Y-%m-%d_%H-%M-%S" + +APP_TITLE = "Counts" +UPDATE_TITLE = "Updating ..." 
+ +DATA_FOLDER = "data" + +ALL_COUNT_FILENAME = "all_counts.imputed.parquet" +ALL_GENES_STAT_FILENAME = "all_genes_summary.csv" +ALL_DESIGNS_FILENAME = "whole_design.csv" + +GENE_ID_COLNAME = "gene_id" +STD_COLNAME = "standard_deviation" +STABILITY_SCORE_COLNAME = "stability_score" +RANK_COLNAME = "rank" +SECTION_COLNAME = "section" + +AG_GRID_DEFAULT_COLUMN_DEF = { + "filter": True, + "resizable": True, + "editable": False, + "sortable": True, +} + +AG_GRID_DEFAULT_OPTIONS = {"pagination": True, "paginationAutoPageSize": True} diff --git a/modules/local/dash_app/app/src/utils/data_management.py b/modules/local/dash_app/app/src/utils/data_management.py new file mode 100644 index 00000000..bfa09ba7 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/data_management.py @@ -0,0 +1,93 @@ +from functools import lru_cache + +import pandas as pd +import polars as pl +from src.utils import config + + +@lru_cache(maxsize=None) +class DataManager: + def __init__(self): + self.all_counts_lf: pl.LazyFrame = self.get_all_count_data() + self.all_genes_stat_df: pl.DataFrame = self.get_all_genes_stat_data() + + @staticmethod + def get_all_count_data() -> pl.LazyFrame: + file = f"{config.DATA_FOLDER}/{config.ALL_COUNT_FILENAME}" + return pl.scan_parquet(file) + + def get_sorted_samples(self) -> list[str]: + return sorted( + self.all_counts_lf.select(pl.exclude(config.GENE_ID_COLNAME)) + .collect_schema() + .names() + ) + + def get_all_genes_stat_data(self) -> pl.DataFrame: + file = f"{config.DATA_FOLDER}/{config.ALL_GENES_STAT_FILENAME}" + stat_df = pl.read_csv(file) + cols_to_select = ["rank"] + [ + col for col in stat_df.columns if col not in ["rank", "is_candidate"] + ] + return stat_df.select(cols_to_select) + + """ + def get_samples_grouped_by_dataset(self) -> list[dict]: + + samples_grouped_by_dataset = [] + + design_file = f"{config.DATA_FOLDER}/{config.ALL_DESIGNS_FILENAME}" + design_df = pd.read_csv(design_file) + + for group, samples in design_df.groupby(["batch", 
"condition"])["sample"]: + batch, condition = group # unpacking + batch_condition_samples_dict = { + "group": f"Dataset: {batch} || Condition: {condition}", + "items": [ + {"value": sample, "label": sample} + for sample in samples.to_list() + if sample in samples_in_count_data + ], + } + samples_grouped_by_dataset.append(batch_condition_samples_dict) + + return samples_grouped_by_dataset + """ + + def get_sorted_genes(self) -> list[str]: + return ( + self.all_genes_stat_df.sort( + by=[config.RANK_COLNAME, config.SECTION_COLNAME], + descending=False, + ) + .select(config.GENE_ID_COLNAME) + .to_series() + .to_list() + ) + + def get_gene_counts(self, gene: str) -> pd.Series: + return ( + self.all_counts_lf.filter(pl.col(config.GENE_ID_COLNAME) == gene) + .select(pl.exclude(config.GENE_ID_COLNAME)) + .collect() + .to_pandas() + .iloc[0] + ) + + def get_sample_counts(self, sample: str) -> pd.Series: + return ( + self.all_counts_lf.select(sample) + .drop_nulls() + .collect() + .to_pandas() + .iloc[:, 0] + ) + + def get_nb_sections(self) -> int: + return self.all_genes_stat_df.select(config.SECTION_COLNAME).n_unique() + + def get_table_raw_data(self) -> list[dict]: + return self.all_genes_stat_df.sort( + by=[config.RANK_COLNAME, config.SECTION_COLNAME], + descending=False, + ).to_dicts() diff --git a/modules/local/dash_app/app/src/utils/style.py b/modules/local/dash_app/app/src/utils/style.py new file mode 100644 index 00000000..65814f06 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/style.py @@ -0,0 +1,83 @@ +LAYOUT = { + "left": "0px", + "top": "0px", + "position": "absolute", + "width": "100%", + "height": "100%", +} + +HEADER_HEIGHT = "5em" + +TAB = { + "position": "fixed", + "top": 0, + "left": 10, + "right": 0, + "width": "100%", + "height": "100%", + #'zIndex': '1001', +} + +HEADER_TABLIST = { + "position": "fixed", + "top": 0, + "left": 10, + "right": 0, + "width": "60%", + "height": HEADER_HEIGHT, + #'zIndex': '1001' +} + +HEADER_TABLIST_ITEM = { + 
#'width': '15vh', + # "text-align": "center", + "paddingRight": "20px", + #'paddingTop': '26px', + #'paddingBottom': '26px', + #'width': LEFT_SIDEBAR_WIDTH +} + +TABS_PANEL = { + "margin-top": HEADER_HEIGHT, + "height": f"calc(100% - {HEADER_HEIGHT})", +} + + +SETTINGS_BUTTON = { + "right": "20px", +} + +SIDEBAR_WIDTH = "15em" + +SIDEBAR = { + "position": "fixed", + "top": HEADER_HEIGHT, + "bottom": 0, + "width": SIDEBAR_WIDTH, + "height": "100vh", + "alignItems": "center", +} + + +DROPDOWN = {"marginTop": "10px", "paddingLeft": "4.2em", "paddingRight": "4.5em"} + +STACK_SUBSECTION_TITLE = {"marginBottom": "-20px"} + +AG_GRID = { + "height": "calc(100% - 10px)", + "top": HEADER_HEIGHT, + "paddingTop": "10px", + "marginRight": "15px", + "paddingRight": "25px", + "marginLeft": "5px", +} + +GRAPH = { + #'width': '100vh', + "top": HEADER_HEIGHT, + "marginLeft": "0px", + "marginRight": "3em", + "marginTop": "2px", + "marginBottom": "3px", + "display": "none", +} diff --git a/modules/local/dash_app/main.nf b/modules/local/dash_app/main.nf new file mode 100644 index 00000000..d64416b7 --- /dev/null +++ b/modules/local/dash_app/main.nf @@ -0,0 +1,59 @@ +process DASH_APP { + + label 'process_high' + + conda "${moduleDir}/app/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fc/fc4abd76b9424d5f5397a6c97e8ed8c2e3a5a454773595204ceb55b39057d812/data': + 'community.wave.seqera.io/library/dash-ag-grid_dash-extensions_dash-iconify_dash-mantine-components_pruned:be6021fe1944629c' }" + + errorStrategy { + if (task.exitStatus == 100) { + log.warn("Could not start the Dash application.") + return 'ignore' // only report errors but ignores it + } else { + log.warn("Could not start the Dash application due to unhandled error.") + return 'ignore' // ignore anyway + } + } + + input: + path all_counts + path whole_design + path all_genes_summary + + output: + path("*"), emit: app + path "versions.yml", emit: versions + + script: + """ + # limiting number of threads to polars / python + export POLARS_MAX_THREADS=${task.cpus} + export OMP_NUM_THREADS=${task.cpus} + + mkdir -p data + mv ${all_counts} ${whole_design} ${all_genes_summary} data/ + cp -r ${moduleDir}/app/* . + + # as of Nextflow version 25.04.8, having these versions sent to the versions topic channel + # results in ERROR ~ No such file or directory: /.command.env + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$( python3 --version | sed "s/Python //" ) + dash: \$( python3 -c "import dash; print(dash.__version__)" ) + dash-extensions: \$( python3 -c "import dash_extensions; print(dash_extensions.__version__)" ) + dash-mantine-components: \$( python3 -c "import dash_mantine_components; print(dash_mantine_components.__version__)" ) + dash-ag-grid: \$( python3 -c "import dash_ag_grid; print(dash_ag_grid.__version__)" ) + polars: \$( python3 -c "import polars; print(polars.__version__)" ) + pandas: \$( python3 -c "import pandas; print(pandas.__version__)" ) + pyarrow: \$( python3 -c "import pyarrow; print(pyarrow.__version__)" ) + scipy: \$( python3 -c "import scipy; print(scipy.__version__)" ) + END_VERSIONS + + # trying to launch the app + # if the resulting exit code is not 124 (exit code of timeout) 
then there is an error + timeout 10 python -B app.py || exit_code=\$?; [ "\$exit_code" -eq 124 ] && exit 0 || exit 100 + """ + +} diff --git a/modules/local/deseq2/normalize/environment.yml b/modules/local/deseq2/normalize/environment.yml deleted file mode 100644 index dc33b239..00000000 --- a/modules/local/deseq2/normalize/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: deseq_normalize -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::r-base==4.3.3 - - bioconda::bioconductor-deseq2==1.42.0 - - conda-forge::r-optparse==1.7.5 diff --git a/modules/local/deseq2/normalize/main.nf b/modules/local/deseq2/normalize/main.nf deleted file mode 100644 index f6274bdb..00000000 --- a/modules/local/deseq2/normalize/main.nf +++ /dev/null @@ -1,31 +0,0 @@ -process DESEQ2_NORMALIZE { - - // debug true - - publishDir "${params.outdir}/normalization/deseq2" - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/cef7164b168e74e5db11dcd9acf6172d47ed6753e4814c68f39835d0c6c22f6d/data': - 'community.wave.seqera.io/library/bioconductor-deseq2_r-base_r-optparse:c84cd7ffdb298fa7' }" - - input: - tuple val(meta), path(count_file) - - output: - tuple val(meta), path('*.log_cpm.csv'), emit: csv - tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions - tuple val("${task.process}"), val('DESeq2'), eval('Rscript -e "cat(as.character(packageVersion(\'DESeq2\')))"'), topic: versions - - - when: - task.ext.when == null || task.ext.when - - script: - def design_file = meta.design - """ - deseq2_normalize.R --counts "$count_file" --design "$design_file" - """ - - -} diff --git a/modules/local/detect_rare_genes/environment.yml b/modules/local/detect_rare_genes/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/detect_rare_genes/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/detect_rare_genes/main.nf b/modules/local/detect_rare_genes/main.nf new file mode 100644 index 00000000..1e2af20d --- /dev/null +++ b/modules/local/detect_rare_genes/main.nf @@ -0,0 +1,40 @@ +process DETECT_RARE_GENES { + + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + path(gene_id_mapping_file) + path(gene_id_occurrences_file) + val(nb_datasets) + val(min_occurrence_frequency) + val(min_occurrence_quantile) + + output: + path('valid_gene_ids.txt'), emit: valid_gene_ids + path('total_gene_id_occurrence_quantiles.csv'), topic: total_gene_id_occurrence_quantiles + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + detect_rare_genes.py \\ + --occurrences $gene_id_occurrences_file \\ + --mappings $gene_id_mapping_file \\ + --nb-datasets $nb_datasets \\ + --min-occurrence-frequency $min_occurrence_frequency \\ + --min-occurrence-quantile $min_occurrence_quantile + + """ + + + stub: + """ + touch fake.validated_genes.txt + """ + +} diff --git a/modules/local/download_ensembl_annotation/environment.yml b/modules/local/download_ensembl_annotation/environment.yml new file mode 100644 index 00000000..8d1ff111 --- /dev/null +++ b/modules/local/download_ensembl_annotation/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tqdm==4.67.3 + - conda-forge::bs4==4.14.3 + - conda-forge::tenacity==9.1.4 diff --git a/modules/local/download_ensembl_annotation/main.nf b/modules/local/download_ensembl_annotation/main.nf new file mode 100644 index 00000000..16bb27cb --- /dev/null +++ b/modules/local/download_ensembl_annotation/main.nf @@ -0,0 +1,34 @@ +process 
DOWNLOAD_ENSEMBL_ANNOTATION { + + label 'process_single' + + tag "${species}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/98/980a21a12b628a41a6c08a91d4f6646d1122f0d0e38387f724d4f4ee020b8b1d/data': + 'community.wave.seqera.io/library/bs4_httpx_pandas_python_pruned:13dbe891a99b6884' }" + + input: + val species + + output: + path "*.gff3.gz", emit: gff3 + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('bs4'), eval('python3 -c "import bs4; print(bs4.__version__)"'), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + """ + download_latest_ensembl_annotation.py \\ + --species ${species} + """ + + stub: + """ + touch fake.gff3.gz.txt + """ + +} diff --git a/modules/local/edger/normalize/environment.yml b/modules/local/edger/normalize/environment.yml deleted file mode 100644 index 8c8def5c..00000000 --- a/modules/local/edger/normalize/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: edger_normalize -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::r-base==4.3.3 - - bioconda::bioconductor-edger==4.0.16 - - conda-forge::r-optparse==1.7.5 diff --git a/modules/local/edger/normalize/main.nf b/modules/local/edger/normalize/main.nf deleted file mode 100644 index 71d9f13b..00000000 --- a/modules/local/edger/normalize/main.nf +++ /dev/null @@ -1,28 +0,0 @@ -process EDGER_NORMALIZE { - - publishDir 
"${params.outdir}/normalization/edger" - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/89/89bbc9544e18b624ed6d0a30e701cf8cec63e063cc9b5243e1efde362fe92228/data': - 'community.wave.seqera.io/library/bioconductor-edger_r-base_r-optparse:400aaabddeea1574' }" - - input: - tuple val(meta), path(count_file) - - output: - tuple val(meta), path('*.log_cpm.csv'), emit: csv - tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions - tuple val("${task.process}"), val('edgeR'), eval('Rscript -e "cat(as.character(packageVersion(\'edgeR\')))"'), topic: versions - - - when: - task.ext.when == null || task.ext.when - - script: - def design_file = meta.design - """ - edger_normalize.R --counts "$count_file" --design "$design_file" - """ - -} diff --git a/modules/local/expressionatlas/getaccessions/environment.yml b/modules/local/expressionatlas/getaccessions/environment.yml index db0221d3..58ac41c2 100644 --- a/modules/local/expressionatlas/getaccessions/environment.yml +++ b/modules/local/expressionatlas/getaccessions/environment.yml @@ -1,7 +1,12 @@ -name: eatlas_get_accessions +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge + - bioconda dependencies: - - conda-forge::requests==2.32.3 - - conda-forge::nltk==3.9.1 - - conda-forge::retry==0.9.2 + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tenacity==9.1.4 + - conda-forge::pyyaml==6.0.3 + - conda-forge::nltk==3.9.2 diff --git a/modules/local/expressionatlas/getaccessions/main.nf b/modules/local/expressionatlas/getaccessions/main.nf index cb31afc8..43434bd2 100644 --- a/modules/local/expressionatlas/getaccessions/main.nf +++ 
b/modules/local/expressionatlas/getaccessions/main.nf @@ -1,44 +1,70 @@ process EXPRESSIONATLAS_GETACCESSIONS { + label 'process_high' + + tag "${species}" + conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/e4/e40fdee15db481a7c9018d85d73fde63235faad794513039a198f3343f2b0e04/data': - 'community.wave.seqera.io/library/nltk_retry_pip_requests:0e24055eb62456ae' }" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f9/f943893a85d82f720432e83fd8d4755e5b42a92deca9d49c06930eaa7fc0c968/data': + 'community.wave.seqera.io/library/httpx_nltk_pandas_python_pruned:ab2f10d1d67a7603' }" input: val species val keywords + val platform + val random_sampling_size + val random_sampling_seed output: - path 'accessions.txt', emit: txt - tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions - tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions - tuple val("${task.process}"), val('nltk'), eval('python3 -c "import nltk; print(nltk.__version__)"'), topic: versions - - - when: - task.ext.when == null || task.ext.when - + path "accessions.txt", optional: true, emit: accessions + env("SAMPLING_QUOTA"), emit: sampling_quota + path "selected_experiments.metadata.tsv", optional: true, topic: eatlas_selected_datasets + path "species_experiments.metadata.tsv", optional: true, topic: eatlas_all_datasets + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions + tuple val("${task.process}"),
val('nltk'), eval('python3 -c "import nltk; print(nltk.__version__)"'), topic: versions + tuple val("${task.process}"), val('pyyaml'), eval('python3 -c "import yaml; print(yaml.__version__)"'), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions script: def keywords_string = keywords.split(',').collect { it.trim() }.join(' ') - - // the folder where nltk will download data needs to be writable (necessary for singularity) - if (keywords_string == "") { - """ - NLTK_DATA=$PWD get_eatlas_accessions.py --species $species - """ - } else { - """ - NLTK_DATA=$PWD get_eatlas_accessions.py --species $species --keywords $keywords_string - """ + def args = " --species $species" + if ( keywords_string != "" ) { + args += " --keywords $keywords_string" + } + if ( platform ) { + args += " --platform $platform" } + if ( random_sampling_size ) { + args += " --random-sampling-size $random_sampling_size" + } + if ( random_sampling_seed ) { + args += " --random-sampling-seed $random_sampling_seed" + } + """ + # limiting CPU usage + export OMP_NUM_THREADS=${task.cpus} + + # the folder where nltk will download data needs to be writable (necessary for singularity) + export NLTK_DATA=\${PWD} + get_eatlas_accessions.py \\ + $args \\ + --cpus ${task.cpus} + + SAMPLING_QUOTA=\$(cat sampling_quota.txt) + """ stub: """ - touch accessions.csv + touch accessions.txt \\ + species_experiments.metadata.tsv \\ + selected_experiments.metadata.tsv \\ + filtered_experiments.keywords.yaml + + SAMPLING_QUOTA="ok" """ } diff --git a/modules/local/expressionatlas/getdata/environment.yml b/modules/local/expressionatlas/getdata/environment.yml index 156f457b..cdb6c8ed 100644 --- a/modules/local/expressionatlas/getdata/environment.yml +++ b/modules/local/expressionatlas/getdata/environment.yml @@ -1,8 +1,9 @@ -name: eatlas_get_data +--- +# yaml-language-server:
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda dependencies: - - conda-forge::r-base==4.3.3 - - bioconda::bioconductor-expressionatlas==1.30.0 + - conda-forge::r-base==4.4.3 - conda-forge::r-optparse==1.7.5 + - bioconda::bioconductor-expressionatlas==1.34.0 diff --git a/modules/local/expressionatlas/getdata/main.nf b/modules/local/expressionatlas/getdata/main.nf index 53a9d4f7..1902185f 100644 --- a/modules/local/expressionatlas/getdata/main.nf +++ b/modules/local/expressionatlas/getdata/main.nf @@ -1,40 +1,36 @@ process EXPRESSIONATLAS_GETDATA { - // when there are network issues, we retry the download with a backoff - // errorStrategy { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } - // maxRetries 5 - - // limiting threads to avoid issues with the Expression Atlas API - maxForks 4 + label 'process_single' + label 'can_fail' tag "$accession" + maxForks 8 // limiting to 8 threads at a time to avoid 429 errors with the Expression Atlas API server + conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7f/7fd21450c3a3f7df37fa0480170780019e9686be319da1c9e10712f7f17cca26/data': - 'community.wave.seqera.io/library/bioconductor-expressionatlas_r-base_r-optparse:ca0f8cd9d3f44af9' }" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/96/963bb5cfef2f27d3c5b2a428b18319c65e4d6ff428be08cf3e124e4f9a25a234/data': + 'community.wave.seqera.io/library/bioconductor-expressionatlas_r-base_r-optparse:e15047a6b3701e2c' }" input: - val(accession) + val accession output: - tuple val(accession), path("*.design.csv"), path("*raw.csv"), optional: true, emit: raw - tuple val(accession), path("*.design.csv"), path("*normalized.csv"), optional: true, emit: normalized + path("*.counts.csv"), optional: true, emit: counts + path("*.design.csv"), optional: true, emit: design + tuple val(accession), path("failure_reason.txt"), optional: true, topic: eatlas_failure_reason + tuple val(accession), path("warning_reason.txt"), optional: true, topic: eatlas_warning_reason tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions tuple val("${task.process}"), val('ExpressionAtlas'), eval('Rscript -e "cat(as.character(packageVersion(\'ExpressionAtlas\')))"'), topic: versions - - when: - task.ext.when == null || task.ext.when - script: """ - get_eatlas_data.R --accession $accession + download_eatlas_data.R --accession $accession """ stub: """ - touch acc.raw.csv + touch acc.raw.counts.csv touch acc.design.csv """ diff --git a/modules/local/extract_gene_ids/environment.yml b/modules/local/extract_gene_ids/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/extract_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/extract_gene_ids/main.nf b/modules/local/extract_gene_ids/main.nf new file mode 100644 index 00000000..962e36ee --- /dev/null +++ b/modules/local/extract_gene_ids/main.nf @@ -0,0 +1,25 
@@ +process EXTRACT_GENE_IDS { + + label 'process_low' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + + output: + path('*.gene_ids.txt'), optional: true, emit: gene_ids + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + extract_gene_ids.py \\ + --count-file "$count_file" + """ +} diff --git a/modules/local/filter_and_rename_genes/environment.yml b/modules/local/filter_and_rename_genes/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/filter_and_rename_genes/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/filter_and_rename_genes/main.nf b/modules/local/filter_and_rename_genes/main.nf new file mode 100644 index 00000000..e63769a7 --- /dev/null +++ b/modules/local/filter_and_rename_genes/main.nf @@ -0,0 +1,50 @@ +process FILTER_AND_RENAME_GENES { + + label 'process_low' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + path gene_id_mapping_file + path valid_gene_ids_file + + output: + tuple val(meta), path('*.renamed.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: renaming_failure_reason + tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: renaming_warning_reason + tuple val(meta.dataset), env("NB_FINAL"), env("NB_MERGED"), env("NB_NOT_VALID"), env("NB_UNMAPPED"), topic: mqc_id_mapping_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def mapping_arg = gene_id_mapping_file ? "--mappings $gene_id_mapping_file" : "" + def valid_ids_arg = valid_gene_ids_file ?
"--valid-gene-ids $valid_gene_ids_file" : "" + """ + filter_and_rename_genes.py \\ + --count-file "$count_file" \\ + $mapping_arg \\ + $valid_ids_arg + + NB_UNMAPPED=\$(cat unmapped.txt) + NB_MERGED=\$(cat merged.txt) + NB_NOT_VALID=\$(cat not_valid.txt) + NB_FINAL=\$(cat final.txt) + """ + + + stub: + """ + touch fake.renamed.parquet + NB_UNMAPPED=0 + NB_MERGED=0 + NB_NOT_VALID=0 + NB_FINAL=0 + """ + +} diff --git a/modules/local/filter_out_samples/with_too_many_missing_values/environment.yml b/modules/local/filter_out_samples/with_too_many_missing_values/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/filter_out_samples/with_too_many_missing_values/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/filter_out_samples/with_too_many_missing_values/main.nf b/modules/local/filter_out_samples/with_too_many_missing_values/main.nf new file mode 100644 index 00000000..ebbdc119 --- /dev/null +++ b/modules/local/filter_out_samples/with_too_many_missing_values/main.nf @@ -0,0 +1,36 @@ +process FILTER_OUT_SAMPLES_WITH_TOO_MANY_MISSING_VALUES { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + path valid_gene_ids + val max_null_ratio + + output: + tuple val(meta), path("*.nulls_filtered.parquet"), optional: true, emit: counts + path("ratio_null_values_per_sample.csv"), emit: ratio_nulls_per_sample + tuple val(meta.dataset), path("ratio_null_values.csv"), topic: ratio_nulls + tuple val(meta.dataset), env("NB_KEPT_SAMPLES"), env("NB_REJECTED_SAMPLES"), topic: mqc_missing_values_filter_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + filter_out_samples_with_too_many_missing_values.py \\ + --counts $count_file \\ + --valid-gene-ids $valid_gene_ids \\ + --max-null-ratio $max_null_ratio + + NB_REJECTED_SAMPLES=\$(cat nb_rejected_samples.csv) + NB_KEPT_SAMPLES=\$(cat nb_kept_samples.csv) + """ + +} diff --git a/modules/local/filter_out_samples/with_too_many_zeros/environment.yml b/modules/local/filter_out_samples/with_too_many_zeros/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/filter_out_samples/with_too_many_zeros/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/filter_out_samples/with_too_many_zeros/main.nf b/modules/local/filter_out_samples/with_too_many_zeros/main.nf new file mode 100644 index 00000000..c380fb00 --- /dev/null +++ 
b/modules/local/filter_out_samples/with_too_many_zeros/main.nf @@ -0,0 +1,33 @@ +process FILTER_OUT_SAMPLES_WITH_TOO_MANY_ZEROS { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + val(max_zero_ratio) + + output: + tuple val(meta), path("*.zeros_filtered.parquet"), optional: true, emit: counts + tuple val(meta.dataset), path("ratio_zeros.csv"), topic: ratio_zeros + tuple val(meta.dataset), env("NB_KEPT_SAMPLES"), env("NB_REJECTED_SAMPLES"), topic: mqc_zero_values_filter_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + filter_out_samples_with_too_many_zeros.py \\ + --counts $count_file \\ + --max-zero-ratio $max_zero_ratio + + NB_REJECTED_SAMPLES=\$(cat nb_rejected_samples.csv) + NB_KEPT_SAMPLES=\$(cat nb_kept_samples.csv) + """ + +} diff --git a/modules/local/genorm/compute_m_measure/environment.yml b/modules/local/genorm/compute_m_measure/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/compute_m_measure/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/compute_m_measure/main.nf 
b/modules/local/genorm/compute_m_measure/main.nf new file mode 100644 index 00000000..0dadf854 --- /dev/null +++ b/modules/local/genorm/compute_m_measure/main.nf @@ -0,0 +1,28 @@ +process COMPUTE_M_MEASURE { + + tag "${meta.section}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file), path(ratio_files) + + output: + tuple val(meta), path("m_measures.csv"), emit: m_measures + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def args = "--task-attempts ${task.attempt}" + """ + compute_m_measures.py \\ + --counts $count_file \\ + --std-files "$ratio_files" \\ + $args + """ + +} diff --git a/modules/local/genorm/cross_join/environment.yml b/modules/local/genorm/cross_join/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/cross_join/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/cross_join/main.nf b/modules/local/genorm/cross_join/main.nf new file mode 100644 index 00000000..aad36bfa --- /dev/null +++ b/modules/local/genorm/cross_join/main.nf @@ -0,0 +1,31 @@ +process CROSS_JOIN { + + tag "${meta.section} :: ${meta.index_1} vs ${meta.index_2}" + label 'process_low' + + 
conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path("count_chunk_file_1"), path("count_chunk_file_2") + + output: + tuple val(meta), path('cross_join.*.parquet'), emit: data + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + make_cross_join.py \\ + --file1 count_chunk_file_1 \\ + --file2 count_chunk_file_2 \\ + --index1 ${meta.index_1} \\ + --index2 ${meta.index_2} \\ + ${args} + """ + +} diff --git a/modules/local/genorm/expression_ratio/environment.yml b/modules/local/genorm/expression_ratio/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/expression_ratio/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/expression_ratio/main.nf b/modules/local/genorm/expression_ratio/main.nf new file mode 100644 index 00000000..6a3d1dd6 --- /dev/null +++ b/modules/local/genorm/expression_ratio/main.nf @@ -0,0 +1,28 @@ +process EXPRESSION_RATIO { + + tag "${meta.section} :: ${meta.index_1} vs ${meta.index_2}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && 
!task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(file) + + output: + tuple val(meta), path('ratios.*.parquet'), emit: data + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + make_pairwise_gene_expression_ratio.py \\ + --file $file \\ + ${args} + """ + +} diff --git a/modules/local/genorm/make_chunks/environment.yml b/modules/local/genorm/make_chunks/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/make_chunks/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/make_chunks/main.nf b/modules/local/genorm/make_chunks/main.nf new file mode 100644 index 00000000..e2802482 --- /dev/null +++ b/modules/local/genorm/make_chunks/main.nf @@ -0,0 +1,28 @@ +process MAKE_CHUNKS { + + tag "${meta.section}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path('count_chunk.*.parquet'), emit: chunks + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + make_parquet_chunks.py \\ + --counts $count_file \\ + ${args} + """ + +} diff --git a/modules/local/genorm/ratio_standard_variation/environment.yml b/modules/local/genorm/ratio_standard_variation/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/ratio_standard_variation/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/ratio_standard_variation/main.nf b/modules/local/genorm/ratio_standard_variation/main.nf new file mode 100644 index 00000000..f0279938 --- /dev/null +++ b/modules/local/genorm/ratio_standard_variation/main.nf @@ -0,0 +1,28 @@ +process RATIO_STANDARD_VARIATION { + + tag "${meta.section} :: ${meta.index_1} vs ${meta.index_2}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(file) + + output: + tuple val(meta), path('std.*.parquet'), emit: data + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + get_ratio_standard_variation.py \\ + --file $file \\ + ${args} + """ + +} diff --git a/modules/local/geo/getaccessions/environment.yml b/modules/local/geo/getaccessions/environment.yml new file mode 100644 index 00000000..1071fce5 --- /dev/null +++ b/modules/local/geo/getaccessions/environment.yml @@ -0,0 +1,14 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tenacity==9.1.4 + - conda-forge::nltk==3.9.2 + - conda-forge::tqdm==4.67.3 + - conda-forge::xmltodict==1.0.3 + - conda-forge::biopython==1.86 diff --git a/modules/local/geo/getaccessions/main.nf b/modules/local/geo/getaccessions/main.nf new file mode 100644 index 00000000..aec9c97d --- /dev/null +++ b/modules/local/geo/getaccessions/main.nf @@ -0,0 +1,75 @@ +process GEO_GETACCESSIONS { + + label 'process_high' + + tag "${species}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/e8/e8be45bdbe57d56f7d452513c4799a878fdfeb2f8ff8351f1c02ee99627dc50e/data': + 'community.wave.seqera.io/library/biopython_httpx_nltk_pandas_pruned:f692df8e1f55b14b' }" + + input: + val species + val keywords + val platform + path excluded_accessions_file + val random_sampling_size + val random_sampling_seed + + output: + path "accessions.txt", optional: true, emit: accessions + path "geo_selected_datasets.metadata.tsv", optional: true, topic: geo_selected_datasets + path "geo_all_datasets.metadata.tsv", optional: true, topic: geo_all_datasets + path "geo_rejected_datasets.metadata.tsv", optional: true, topic: geo_rejected_datasets + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions + tuple val("${task.process}"), val('nltk'), eval('python3 -c "import nltk; print(nltk.__version__)"'), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('biopython'), eval('python3 -c "import Bio; print(Bio.__version__)"'), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + def keywords_string = keywords.split(',').collect { it.trim() }.join(' ') + def args = " --species $species" + if ( keywords_string != "" ) { + args += " --keywords $keywords_string" + } + if ( platform ) { + args += " --platform $platform" + } + if ( excluded_accessions_file ) { + args += " --exclude-accessions-in $excluded_accessions_file" + } + if ( random_sampling_size ) { + args += " --random-sampling-size $random_sampling_size" + } + if ( random_sampling_seed ) { + args += " --random-sampling-seed $random_sampling_seed" + } + // the folder 
where nltk will download data needs to be writable (necessary for singularity) + """ + # limiting CPU usage + export OMP_NUM_THREADS=${task.cpus} + + # the Entrez module from biopython automatically stores temp results in /.config + # if this directory is not writable, the script fails + export HOME=/tmp/biopython + mkdir -p /tmp/biopython + + export NLTK_DATA=\${PWD} + + get_geo_dataset_accessions.py \\ + $args \\ + --cpus ${task.cpus} + """ + + stub: + """ + touch accessions.txt \\ + geo_all_datasets.metadata.tsv \\ + geo_selected_datasets.metadata.tsv \\ + geo_rejected_datasets.metadata.tsv + """ + +} diff --git a/modules/local/geo/getdata/environment.yml b/modules/local/geo/getdata/environment.yml new file mode 100644 index 00000000..a9e4fb27 --- /dev/null +++ b/modules/local/geo/getdata/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-base==4.5.3 + - conda-forge::r-optparse==1.7.5 + - conda-forge::r-dplyr==1.2.0 + - bioconda::bioconductor-geoquery==2.78.0 + - conda-forge::wget==1.25.0 diff --git a/modules/local/geo/getdata/main.nf b/modules/local/geo/getdata/main.nf new file mode 100644 index 00000000..5996862c --- /dev/null +++ b/modules/local/geo/getdata/main.nf @@ -0,0 +1,42 @@ +process GEO_GETDATA { + + label 'process_single' + label 'can_fail' + + tag "$accession" + + maxForks 8 // limiting to 8 threads at a time to avoid 429 errors with the NCBI server + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/2d/2dd2efcca10168936aabe4209344f952df791be9a7530ddbd9e89cdbfc426a7c/data': + 'community.wave.seqera.io/library/bioconductor-geoquery_r-base_r-dplyr_r-optparse_wget:f425756c75602053' }" + + input: + val accession + val species + + output: + path("*.counts.csv"), optional: true, emit: counts + path("*.design.csv"), optional: true, emit: design + path("rejected/**"), optional: true, emit: rejected + tuple val(accession), path("failure_reason.txt"), optional: true, topic: geo_failure_reason + tuple val(accession), path("warning_reason.txt"), optional: true, topic: geo_warning_reason + tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions + tuple val("${task.process}"), val('GEOquery'), eval('Rscript -e "cat(as.character(packageVersion(\'GEOquery\')))"'), topic: versions + tuple val("${task.process}"), val('dplyr'), eval('Rscript -e "cat(as.character(packageVersion(\'dplyr\')))"'), topic: versions + + script: + """ + download_geo_data.R \\ + --accession $accession \\ + --species $species + """ + + stub: + """ + touch acc.microarray.normalised.counts.csv + touch acc.design.csv + """ + +} diff --git a/modules/local/get_candidate_genes/environment.yml b/modules/local/get_candidate_genes/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/get_candidate_genes/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/get_candidate_genes/main.nf b/modules/local/get_candidate_genes/main.nf new file mode 100644 index 00000000..f0d97e92 --- /dev/null +++ b/modules/local/get_candidate_genes/main.nf @@ -0,0 +1,31 @@ +process GET_CANDIDATE_GENES { + + label 
'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + path count_file + path stat_file + val nb_candidates_per_section + val nb_sections + + output: + path 'section_*.candidate_counts.parquet', emit: counts + path 'section_*.stats.parquet', emit: section_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + get_candidate_genes.py \\ + --counts $count_file \\ + --stats $stat_file \\ + --nb-candidates-per-section $nb_candidates_per_section \\ + --nb-sections $nb_sections + """ + +} diff --git a/modules/local/gprofiler/idmapping/environment.yml b/modules/local/gprofiler/idmapping/environment.yml index 03785ed8..317e648d 100644 --- a/modules/local/gprofiler/idmapping/environment.yml +++ b/modules/local/gprofiler/idmapping/environment.yml @@ -1,6 +1,10 @@ -name: GPROFILER_IDMAPPING +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge + - bioconda dependencies: - - conda-forge::pandas==2.2.3 - - conda-forge::requests==2.32.3 + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tenacity==9.1.4 diff --git a/modules/local/gprofiler/idmapping/main.nf b/modules/local/gprofiler/idmapping/main.nf index 8e974471..a08392da 100644 --- a/modules/local/gprofiler/idmapping/main.nf +++ b/modules/local/gprofiler/idmapping/main.nf @@ -1,36 +1,53 @@ process GPROFILER_IDMAPPING { - - 
publishDir "${params.outdir}/idmapping" - - // limiting to 8 threads at a time to avoid 429 errors with the G Profiler API server - maxForks 8 + label 'process_medium' + + tag "${species} IDs to ${gprofiler_target_db}" + + errorStrategy { + if (task.exitStatus == 100 ) { + log.error("Could not map gene IDs to ${gprofiler_target_db} database.") + 'terminate' + } else if (task.exitStatus in ((130..145) + 104 + 175) && task.attempt <= 10) { // OOM & related errors; should be retried as long as memory does not fit + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3 + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else { // retries exhausted: 'finish' lets already-submitted tasks complete, then stops the run (NOTE(review): use 'ignore' if the error should really be ignored) + 'finish' + } + } conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fe/fe3f927f5032b9f0749fd5a2d3431b483f1c8cb1613d0290e2326fec10bf8268/data': - 'community.wave.seqera.io/library/pandas_requests:c7451d98ba573475' }" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/76/767aed0eb8001eaede58f71b9ca72a658c9ca1929b129ed9cf209a8510541c39/data': + 'community.wave.seqera.io/library/httpx_pandas_python_tenacity:233acc91f7920d99' }" input: - tuple val(meta), path(count_file), val(species) + path gene_id_file + val species + val gprofiler_target_db output: - path('*.csv'), emit: csv - tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions - tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions - tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions - - when: - task.ext.when == null || task.ext.when + path('mapped_gene_ids.csv'), emit: mapping + path('gene_metadata.csv'), emit: metadata + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions script: """ - map_ids_to_ensembl.py --count-file "$count_file" --species "$species" + gprofiler_map_ids.py \\ + --gene-ids $gene_id_file \\ + --species "$species" \\ + --target-db "$gprofiler_target_db" """ stub: """ - touch fake_renamed.csv + touch mapped_gene_ids.csv + touch gene_metadata.csv """ } diff --git a/modules/local/impute_missing_values/environment.yml b/modules/local/impute_missing_values/environment.yml new file mode 100644 index 00000000..7be13b15 --- /dev/null +++ b/modules/local/impute_missing_values/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - 
conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 + - conda-forge::scikit-learn==1.8.0 diff --git a/modules/local/impute_missing_values/main.nf b/modules/local/impute_missing_values/main.nf new file mode 100644 index 00000000..d2618d0e --- /dev/null +++ b/modules/local/impute_missing_values/main.nf @@ -0,0 +1,27 @@ +process IMPUTE_MISSING_VALUES { + + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/57/5751f4c7c1eb17d92c2863dec2b7505295e56eafb65ea5a9df66876fbffd24e3/data': + 'community.wave.seqera.io/library/polars_python_scikit-learn:041254a8f0633213' }" + + input: + tuple val(meta), path(count_file) + val missing_value_imputer + + output: + tuple val(meta), path('*.imputed.parquet'), emit: counts + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('scikit-learn'), eval('python3 -c "import sklearn; print(sklearn.__version__)"'), topic: versions + + script: + """ + impute_missing_values.py \\ + --counts $count_file \\ + --imputer $missing_value_imputer + """ + +} diff --git a/modules/local/merge_counts/environment.yml b/modules/local/merge_counts/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/merge_counts/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/merge_counts/main.nf b/modules/local/merge_counts/main.nf new file mode 100644 index 
00000000..d35b428a --- /dev/null +++ b/modules/local/merge_counts/main.nf @@ -0,0 +1,24 @@ +process MERGE_COUNTS { + + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_files, stageAs: "?/*") + + output: + tuple val(meta), path('all_counts.parquet'), emit: counts + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + merge_counts.py \\ + --counts "$count_files" + """ + +} diff --git a/modules/local/normalisation/compute_cpm/environment.yml b/modules/local/normalisation/compute_cpm/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/normalisation/compute_cpm/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/normalisation/compute_cpm/main.nf b/modules/local/normalisation/compute_cpm/main.nf new file mode 100644 index 00000000..005ef619 --- /dev/null +++ b/modules/local/normalisation/compute_cpm/main.nf @@ -0,0 +1,29 @@ +process NORMALISATION_COMPUTE_CPM { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path('*.cpm.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason + tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + compute_cpm.py \\ + --counts $count_file + """ + + +} diff --git a/modules/local/normalisation/compute_tpm/environment.yml b/modules/local/normalisation/compute_tpm/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/normalisation/compute_tpm/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/normalisation/compute_tpm/main.nf b/modules/local/normalisation/compute_tpm/main.nf new file mode 100644 index 00000000..d90b35fc --- /dev/null +++ b/modules/local/normalisation/compute_tpm/main.nf @@ -0,0 +1,31 @@ +process NORMALISATION_COMPUTE_TPM { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + path gene_lengths_file + + output: + tuple val(meta), path('*.tpm.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason + tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + compute_tpm.py \\ + --counts $count_file \\ + --gene-lengths $gene_lengths_file + """ + + +} diff --git a/modules/local/normfinder/environment.yml b/modules/local/normfinder/environment.yml new file mode 100644 index 00000000..3d7ed06f --- /dev/null +++ b/modules/local/normfinder/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 + - conda-forge::tqdm==4.67.3 + - conda-forge::numpy==2.4.3 + - conda-forge::numba==0.64.0 diff --git a/modules/local/normfinder/main.nf b/modules/local/normfinder/main.nf new file mode 100644 index 00000000..d3ac7809 --- /dev/null +++ b/modules/local/normfinder/main.nf @@ -0,0 +1,36 @@ +process NORMFINDER { + + tag "${meta.section}" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/05/0526f3dbdd23175430f0af81763c1079b3b1425b2cdb2491ab54bb9c0d93d480/data': + 'community.wave.seqera.io/library/numba_numpy_polars_python_tqdm:f42e9bc9f30a29ff' }" + + input: + tuple val(meta), path(count_file) + path design_file + + output: + tuple val(meta), path('stability_values.normfinder.csv'), emit: stability_values + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + tuple val("${task.process}"), val('numpy'), eval('python3 -c "import numpy; print(numpy.__version__)"'), topic: versions + tuple val("${task.process}"), val('numba'), eval('python3 -c "import numba; print(numba.__version__)"'), topic: versions + + script: + """ + normfinder.py \\ + --counts $count_file \\ + --design $design_file + """ + + stub: + + """ + touch stability_values.normfinder.csv + """ + +} diff --git a/modules/local/quantile_normalisation/environment.yml b/modules/local/quantile_normalisation/environment.yml new file mode 100644 index 00000000..7be13b15 --- /dev/null +++ b/modules/local/quantile_normalisation/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 + - conda-forge::scikit-learn==1.8.0 diff --git a/modules/local/quantile_normalisation/main.nf b/modules/local/quantile_normalisation/main.nf new file mode 100644 index 00000000..a9106c2e --- /dev/null +++ b/modules/local/quantile_normalisation/main.nf @@ -0,0 +1,34 @@ +process QUANTILE_NORMALISATION { + + label 'process_low' + + tag 
"${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/57/5751f4c7c1eb17d92c2863dec2b7505295e56eafb65ea5a9df66876fbffd24e3/data': + 'community.wave.seqera.io/library/polars_python_scikit-learn:041254a8f0633213' }" + + input: + tuple val(meta), path(count_file) + val target_distribution + + output: + tuple val(meta), path('*.quant_norm.parquet'), emit: counts + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('scikit-learn'), eval('python3 -c "import sklearn; print(sklearn.__version__)"'), topic: versions + + script: + """ + quantile_normalise.py \\ + --counts $count_file \\ + --target-distrib $target_distribution + """ + + stub: + """ + touch count.cpm.quant_norm.parquet + """ + +} diff --git a/modules/local/variation_coefficient/environment.yml b/modules/local/variation_coefficient/environment.yml deleted file mode 100644 index 5beee07a..00000000 --- a/modules/local/variation_coefficient/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: variation_coefficient -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::r-base==4.3.3 - - conda-forge::r-optparse==1.7.5 diff --git a/modules/local/variation_coefficient/main.nf b/modules/local/variation_coefficient/main.nf deleted file mode 100644 index 29532d6c..00000000 --- a/modules/local/variation_coefficient/main.nf +++ /dev/null @@ -1,26 +0,0 @@ -process VARIATION_COEFFICIENT { - - publishDir "${params.outdir}/variation_coefficients" - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/73/733cbf61013292f639ec24adcec9548a119ea6254d3ba51c6503ffaba6acda4f/data': - 'community.wave.seqera.io/library/r-base_r-optparse:f7a5d8afb6d6fa3d' }" - - input: - path(count_files, stageAs: "?/*") - - output: - path 'variation_coefficients.csv', emit: csv - tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions - - - when: - task.ext.when == null || task.ext.when - - script: - """ - get_variation_coefficient.R --count-files "$count_files" - """ - -} diff --git a/modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt b/modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt new file mode 100644 index 00000000..76190304 --- /dev/null +++ b/modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt @@ -0,0 +1,1552 @@ + +version: 6 +environments: +default: +channels: +- url: https://conda.anaconda.org/conda-forge/ +- url: https://conda.anaconda.org/bioconda/ +- url: https://conda.anaconda.org/bioconda/ +options: +pypi-prerelease-mode: if-necessary-or-explicit +packages: +linux-64: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py314h3de4e8d_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/expat-2.7.4-hecca717_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.17.1-h27c8c51_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/kaleido-core-0.2.1-h3644ca4_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.18-h0c24ade_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.1.0-hdb68285_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.14.3-ha770c72_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.14.3-h73754d4_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.2-hb03c661_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.55-h421ea60_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.52.0-hf4e2dac_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.1-h9d88235_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.6.0-hd42ef1d_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py314h67df5f8_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/mathjax-2.7.7-ha770c72_3.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/nspr-4.38-h29cc59b_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/nss-3.118-h445c969_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.4-h55fea9a_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-12.1.1-py314h8ec4b1a_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-32-1.39.3-py310hffdcd12_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-compat-1.39.3-py310hbcd5346_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/procps-ng-4.0.6-h18c060e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-core-2.41.5-py314h2e6c369_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2026.2.28-py314h5bd0f2a_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/rpds-py-0.30.0-py314h2e6c369_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.52.0-h04a0ce9_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/tiktoken-0.12.0-py314h67fec18_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb03c661_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb03c661_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.3.3-hceb46e0_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda +build_number: 20 +sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 +md5: a9f577daf3de00bca7c3c76c0ecbd1de +depends: +- __glibc >=2.17,<3.0.a0 +- libgomp >=7.5.0 +constrains: +- openmp_impl <0.0a0 +license: BSD-3-Clause +license_family: BSD +size: 28948 +timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +sha256: a3967b937b9abf0f2a99f3173fa4630293979bd1644709d89580e7c62a544661 +md5: aaa2a381ccc56eac91d63b6c1240312f +depends: +- cpython +- python-gil +license: MIT +license_family: MIT +size: 8191 +timestamp: 1744137672556 +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +sha256: e0ea1ba78fbb64f17062601edda82097fcf815012cf52bb704150a2668110d48 +md5: 2934f256a8acfe48f6ebb4fce6cde29c +depends: +- python >=3.9 +- typing-extensions >=4.0.0 +license: MIT +license_family: MIT +size: 18074 +timestamp: 1733247158254 +- conda: https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +sha256: 1b6124230bb4e571b1b9401537ecff575b7b109cc3a21ee019f65e083b8399ab +md5: c6b0543676ecb1fb2d7643941fe375f2 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 64927 +timestamp: 1773935801332 +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda 
+noarch: generic +sha256: c31ab719d256bc6f89926131e88ecd0f0c5d003fe8481852c6424f4ec6c7eb29 +md5: a2ac7763a9ac75055b68f325d3255265 +depends: +- python >=3.14 +license: BSD-3-Clause AND MIT AND EPL-2.0 +size: 7514 +timestamp: 1767044983590 +- conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py314h3de4e8d_1.conda +sha256: 3ad3500bff54a781c29f16ce1b288b36606e2189d0b0ef2f67036554f47f12b0 +md5: 8910d2c46f7e7b519129f486e0fe927a +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +constrains: +- libbrotlicommon 1.2.0 hb03c661_1 +license: MIT +license_family: MIT +size: 367376 +timestamp: 1764017265553 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda +sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 +md5: d2ffd7602c02f2b316fd921d39876885 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: bzip2-1.0.6 +license_family: BSD +size: 260182 +timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc +md5: 4492fd26db29495f0ba23f146cd5638d +depends: +- __unix +license: ISC +size: 147413 +timestamp: 1772006283803 +- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +sha256: a6b118fd1ed6099dc4fc03f9c492b88882a780fadaef4ed4f93dc70757713656 +md5: 765c4d97e877cdbbb88ff33152b86125 +depends: +- python >=3.10 +license: ISC +size: 151445 +timestamp: 1772001170301 +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +sha256: d86dfd428b2e3c364fa90e07437c8405d635aa4ef54b25ab51d9c712be4112a5 +md5: 49ee13eb9b8f44d63879c69b8a40a74b +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 58510 +timestamp: 1773660086450 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715 +md5: ea8a6c3256897cc31263de9f455e25d9 +depends: +- python >=3.10 +- __unix +- python +license: BSD-3-Clause +license_family: BSD +size: 97676 +timestamp: 1764518652276 +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +sha256: 8021c76eeadbdd5784b881b165242db9449783e12ce26d6234060026fd6a8680 +md5: b866ff7007b934d564961066c8195983 +depends: +- humanfriendly >=9.1 +- python >=3.9 +license: MIT +license_family: MIT +size: 43758 +timestamp: 1733928076798 +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +sha256: 59c9e29800b483b390467f90e82b0da3a4fbf0612efe1c90813fca232780e160 +md5: 071cf7b0ce333c81718b054066c15102 +depends: +- networkx >=2.0 +- numpy +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 39326 +timestamp: 1735759976140 +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +noarch: generic +sha256: 91b06300879df746214f7363d6c27c2489c80732e46a369eb2afc234bcafb44c +md5: 3bb89e4f795e5414addaa531d6b1500a +depends: +- python >=3.14,<3.15.0a0 +- python_abi * *_cp314 +license: Python-2.0 +size: 50078 +timestamp: 1770674447292 +- conda: https://conda.anaconda.org/conda-forge/linux-64/expat-2.7.4-hecca717_0.conda +sha256: 0cc345e4dead417996ce9a1f088b28d858f03d113d43c1963d29194366dcce27 +md5: a0535741a4934b3e386051065c58761a +depends: +- __glibc >=2.17,<3.0.a0 +- libexpat 2.7.4 hecca717_0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 145274 +timestamp: 1771259434699 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +sha256: 58d7f40d2940dd0a8aa28651239adbf5613254df0f75789919c4e6762054403b +md5: 0c96522c6bdaed4b1566d11387caaf45 +license: BSD-3-Clause +license_family: BSD +size: 397370 +timestamp: 1566932522327 +- 
conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +sha256: c52a29fdac682c20d252facc50f01e7c2e7ceac52aa9817aaf0bb83f7559ec5c +md5: 34893075a5c9e55cdafac56607368fc6 +license: OFL-1.1 +license_family: Other +size: 96530 +timestamp: 1620479909603 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +sha256: 00925c8c055a2275614b4d983e1df637245e19058d79fc7dd1a93b8d9fb4b139 +md5: 4d59c254e01d9cde7957100457e2d5fb +license: OFL-1.1 +license_family: Other +size: 700814 +timestamp: 1620479612257 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +sha256: 2821ec1dc454bd8b9a31d0ed22a7ce22422c0aef163c59f49dfdf915d0f0ca14 +md5: 49023d73832ef61042f6a237cb2687e7 +license: LicenseRef-Ubuntu-Font-Licence-Version-1.0 +license_family: Other +size: 1620504 +timestamp: 1727511233259 +- conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.17.1-h27c8c51_0.conda +sha256: aa4a44dba97151221100a637c7f4bde619567afade9c0265f8e1c8eed8d7bd8c +md5: 867127763fbe935bab59815b6e0b7b5c +depends: +- __glibc >=2.17,<3.0.a0 +- libexpat >=2.7.4,<3.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +- libgcc >=14 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +license: MIT +license_family: MIT +size: 270705 +timestamp: 1771382710863 +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +sha256: 54eea8469786bc2291cc40bca5f46438d3e062a399e8f53f013b6a9f50e98333 +md5: a7970cd949a077b7cb9696379d338681 +depends: +- font-ttf-ubuntu +- font-ttf-inconsolata +- font-ttf-dejavu-sans-mono +- font-ttf-source-code-pro +license: BSD-3-Clause +license_family: BSD +size: 4059 +timestamp: 1762351264405 +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +sha256: 84c64443368f84b600bfecc529a1194a3b14c3656ee2e832d15a20e0329b6da3 +md5: 164fc43f0b53b6e3a7bc7dce5e4f1dc9 +depends: +- python >=3.10 
+- hyperframe >=6.1,<7 +- hpack >=4.1,<5 +- python +license: MIT +license_family: MIT +size: 95967 +timestamp: 1756364871835 +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba +md5: 0a802cb9888dd14eeefc611f05c40b6e +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 30731 +timestamp: 1737618390337 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +sha256: fa2071da7fab758c669e78227e6094f6b3608228740808a6de5d6bce83d9e52d +md5: 7fe569c10905402ed47024fc481bb371 +depends: +- __unix +- python >=3.9 +license: MIT +license_family: MIT +size: 73563 +timestamp: 1733928021866 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +sha256: 6c4343b376d0b12a4c75ab992640970d36c933cad1fd924f6a1181fa91710e80 +md5: daddf757c3ecd6067b9af1df1f25d89e +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 67994 +timestamp: 1766267728652 +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8 +md5: 8e6923fc12f1fe8f8c4e5c9f343256ac +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 17397 +timestamp: 1737618427549 +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda +sha256: fbf86c4a59c2ed05bbffb2ba25c7ed94f6185ec30ecb691615d42342baa1a16a +md5: c80d8a3b84358cb967fa81e7075fbc8a +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: MIT +license_family: MIT +size: 12723451 +timestamp: 1773822285671 +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +sha256: ae89d0299ada2a3162c2614a9d26557a92aa6a77120ce142f8e0109bbf0342b0 +md5: 53abe63df7e10a6ba605dc5f9f961d36 +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 50721 +timestamp: 
1760286526795 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +sha256: 82ab2a0d91ca1e7e63ab6a4939356667ef683905dea631bc2121aa534d347b16 +md5: 080594bf4493e6bae2607e65390c520a +depends: +- python >=3.10 +- zipp >=3.20 +- python +license: Apache-2.0 +license_family: APACHE +size: 34387 +timestamp: 1773931568510 +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +sha256: fc9ca7348a4f25fed2079f2153ecdcf5f9cf2a0bc36c4172420ca09e1849df7b +md5: 04558c96691bed63104678757beb4f8d +depends: +- markupsafe >=2.0 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 120685 +timestamp: 1764517220861 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +sha256: db973a37d75db8e19b5f44bbbdaead0c68dde745407f281e2a7fe4db74ec51d7 +md5: ada41c863af263cc4c5fcbaff7c3e4dc +depends: +- attrs >=22.2.0 +- jsonschema-specifications >=2023.3.6 +- python >=3.10 +- referencing >=0.28.4 +- rpds-py >=0.25.0 +- python +license: MIT +license_family: MIT +size: 82356 +timestamp: 1767839954256 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +sha256: 0a4f3b132f0faca10c89fdf3b60e15abb62ded6fa80aebfc007d05965192aa04 +md5: 439cd0f567d697b20a8f45cb70a1005a +depends: +- python >=3.10 +- referencing >=0.31.0 +- python +license: MIT +license_family: MIT +size: 19236 +timestamp: 1757335715225 +- conda: https://conda.anaconda.org/conda-forge/linux-64/kaleido-core-0.2.1-h3644ca4_0.tar.bz2 +sha256: 7f243680ca03eba7457b7a48f93a9440ba8181a8eac20a3eb5ef165ab6c96664 +md5: b3723b235b0758abaae8c82ce4d80146 +depends: +- __glibc >=2.17,<3.0.a0 +- expat >=2.2.10,<3.0.0a0 +- fontconfig +- fonts-conda-forge +- libgcc-ng >=9.3.0 +- mathjax 2.7.* +- nspr >=4.29,<5.0a0 +- nss >=3.62,<4.0a0 +- sqlite >=3.34.0,<4.0a0 +license: MIT +license_family: MIT +size: 62099926 +timestamp: 1615199463039 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.18-h0c24ade_0.conda +sha256: 836ec4b895352110335b9fdcfa83a8dcdbe6c5fb7c06c4929130600caea91c0a +md5: 6f2e2c8f58160147c4d1c6f4c14cbac4 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libtiff >=4.7.1,<4.8.0a0 +license: MIT +license_family: MIT +size: 249959 +timestamp: 1768184673131 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda +sha256: 3d584956604909ff5df353767f3a2a2f60e07d070b328d109f30ac40cd62df6c +md5: 18335a698559cdbcd86150a48bf54ba6 +depends: +- __glibc >=2.17,<3.0.a0 +- zstd >=1.5.7,<1.6.0a0 +constrains: +- binutils_impl_linux-64 2.45.1 +license: GPL-3.0-only +license_family: GPL +size: 728002 +timestamp: 1774197446916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.1.0-hdb68285_0.conda +sha256: f84cb54782f7e9cea95e810ea8fef186e0652d0fa73d3009914fa2c1262594e1 +md5: a752488c68f2e7c456bcbd8f16eec275 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: Apache-2.0 +license_family: Apache +size: 261513 +timestamp: 1773113328888 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda +build_number: 5 +sha256: 18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c +md5: c160954f7418d7b6e87eaf05a8913fa9 +depends: +- libopenblas >=0.3.30,<0.3.31.0a0 +- libopenblas >=0.3.30,<1.0a0 +constrains: +- mkl <2026 +- liblapack 3.11.0 5*_openblas +- libcblas 3.11.0 5*_openblas +- blas 2.305 openblas +- liblapacke 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18213 +timestamp: 1765818813880 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda +build_number: 5 +sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 +md5: 6636a2b6f1a87572df2970d3ebc87cc0 +depends: +- libblas 3.11.0 5_h4a7cf45_openblas +constrains: +- liblapacke 3.11.0 5*_openblas 
+- blas 2.305 openblas +- liblapack 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18194 +timestamp: 1765818837135 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda +sha256: aa8e8c4be9a2e81610ddf574e05b64ee131fab5e0e3693210c9d6d2fba32c680 +md5: 6c77a605a7a689d17d4819c0f8ac9a00 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 73490 +timestamp: 1761979956660 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda +sha256: d78f1d3bea8c031d2f032b760f36676d87929b18146351c4464c66b0869df3f5 +md5: e7f7ce06ec24cfcfb9e36d28cf82ba57 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- expat 2.7.4.* +license: MIT +license_family: MIT +size: 76798 +timestamp: 1771259418166 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda +sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 +md5: a360c33a5abe61c07959e449fa1453eb +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 58592 +timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.14.3-ha770c72_0.conda +sha256: 38f014a7129e644636e46064ecd6b1945e729c2140e21d75bb476af39e692db2 +md5: e289f3d17880e44b633ba911d57a321b +depends: +- libfreetype6 >=2.14.3 +license: GPL-2.0-only OR FTL +size: 8049 +timestamp: 1774298163029 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.14.3-h73754d4_0.conda +sha256: 16f020f96da79db1863fcdd8f2b8f4f7d52f177dd4c58601e38e9182e91adf1d +md5: fb16b4b69e3f1dcfe79d80db8fd0c55d +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libpng >=1.6.55,<1.7.0a0 +- libzlib >=1.3.2,<2.0a0 +constrains: +- freetype >=2.14.3 +license: GPL-2.0-only OR FTL +size: 384575 +timestamp: 1774298162622 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda +sha256: 
faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 +md5: 0aa00f03f9e39fb9876085dee11a85d4 +depends: +- __glibc >=2.17,<3.0.a0 +- _openmp_mutex >=4.5 +constrains: +- libgcc-ng ==15.2.0=*_18 +- libgomp 15.2.0 he0feb66_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 1041788 +timestamp: 1771378212382 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda +sha256: e318a711400f536c81123e753d4c797a821021fb38970cebfb3f454126016893 +md5: d5e96b1ed75ca01906b3d2469b4ce493 +depends: +- libgcc 15.2.0 he0feb66_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27526 +timestamp: 1771378224552 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda +sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee +md5: 9063115da5bc35fdc3e1002e69b9ef6e +depends: +- libgfortran5 15.2.0 h68bc16d_18 +constrains: +- libgfortran-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27523 +timestamp: 1771378269450 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda +sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 +md5: 646855f357199a12f02a87382d429b75 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=15.2.0 +constrains: +- libgfortran 15.2.0 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 2482475 +timestamp: 1771378241063 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda +sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 +md5: 239c5e9546c38a1e884d69effcf4c882 +depends: +- __glibc >=2.17,<3.0.a0 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 603262 +timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.2-hb03c661_0.conda +sha256: 
cc9aba923eea0af8e30e0f94f2ad7156e2984d80d1e8e7fe6be5a1f257f0eb32 +md5: 8397539e3a0bbd1695584fb4f927485a +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- jpeg <0.0.0a +license: IJG AND BSD-3-Clause AND Zlib +size: 633710 +timestamp: 1762094827865 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda +build_number: 5 +sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 +md5: b38076eb5c8e40d0106beda6f95d7609 +depends: +- libblas 3.11.0 5_h4a7cf45_openblas +constrains: +- blas 2.305 openblas +- liblapacke 3.11.0 5*_openblas +- libcblas 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18200 +timestamp: 1765818857876 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda +sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb +md5: c7c83eecbb72d88b940c249af56c8b17 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- xz 5.8.2.* +license: 0BSD +size: 113207 +timestamp: 1768752626120 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda +sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 +md5: 2c21e66f50753a083cbe6b80f38268fa +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: BSD-2-Clause +license_family: BSD +size: 92400 +timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda +sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 +md5: be43915efc66345cccb3c310b6ed0374 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libgfortran +- libgfortran5 >=14.3.0 +constrains: +- openblas >=0.3.30,<0.3.31.0a0 +license: BSD-3-Clause +license_family: BSD +size: 5927939 +timestamp: 1763114673331 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.55-h421ea60_0.conda +sha256: 36ade759122cdf0f16e2a2562a19746d96cf9c863ffaa812f2f5071ebbe9c03c 
+md5: 5f13ffc7d30ffec87864e678df9957b4 +depends: +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- libzlib >=1.3.1,<2.0a0 +license: zlib-acknowledgement +size: 317669 +timestamp: 1770691470744 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.52.0-hf4e2dac_0.conda +sha256: d716847b7deca293d2e49ed1c8ab9e4b9e04b9d780aea49a97c26925b28a7993 +md5: fd893f6a3002a635b5e50ceb9dd2c0f4 +depends: +- __glibc >=2.17,<3.0.a0 +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +license: blessing +size: 951405 +timestamp: 1772818874251 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda +sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e +md5: 1b08cd684f34175e4514474793d44bcb +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc 15.2.0 he0feb66_18 +constrains: +- libstdcxx-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 5852330 +timestamp: 1771378262446 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.1-h9d88235_1.conda +sha256: e5f8c38625aa6d567809733ae04bb71c161a42e44a9fa8227abe61fa5c60ebe0 +md5: cd5a90476766d53e901500df9215e927 +depends: +- __glibc >=2.17,<3.0.a0 +- lerc >=4.0.0,<5.0a0 +- libdeflate >=1.25,<1.26.0a0 +- libgcc >=14 +- libjpeg-turbo >=3.1.0,<4.0a0 +- liblzma >=5.8.1,<6.0a0 +- libstdcxx >=14 +- libwebp-base >=1.6.0,<2.0a0 +- libzlib >=1.3.1,<2.0a0 +- zstd >=1.5.7,<1.6.0a0 +license: HPND +size: 435273 +timestamp: 1762022005702 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda +sha256: 1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee +md5: db409b7c1720428638e7c0d509d3e1b5 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: BSD-3-Clause +license_family: BSD +size: 40311 +timestamp: 1766271528534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.6.0-hd42ef1d_0.conda +sha256: 3aed21ab28eddffdaf7f804f49be7a7d701e8f0e46c856d801270b470820a37b 
+md5: aea31d2e5b1091feca96fcfe945c3cf9 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- libwebp 1.6.0 +license: BSD-3-Clause +license_family: BSD +size: 429011 +timestamp: 1752159441324 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda +sha256: 666c0c431b23c6cec6e492840b176dde533d48b7e6fb8883f5071223433776aa +md5: 92ed62436b625154323d40d5f2f11dd7 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=13 +- pthread-stubs +- xorg-libxau >=1.0.11,<2.0a0 +- xorg-libxdmcp +license: MIT +license_family: MIT +size: 395888 +timestamp: 1727278577118 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda +sha256: 55044c403570f0dc26e6364de4dc5368e5f3fc7ff103e867c487e2b5ab2bcda9 +md5: d87ff7921124eccd67248aa483c23fec +depends: +- __glibc >=2.17,<3.0.a0 +constrains: +- zlib 1.3.2 *_2 +license: Zlib +license_family: Other +size: 63629 +timestamp: 1774072609062 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +sha256: 20e0892592a3e7c683e3d66df704a9425d731486a97c34fc56af4da1106b2b6b +md5: ba0a9221ce1063f31692c07370d062f3 +depends: +- importlib-metadata >=4.4 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 85893 +timestamp: 1770694658918 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +sha256: 7b1da4b5c40385791dbc3cc85ceea9fad5da680a27d5d3cb8bfaa185e304a89e +md5: 5b5203189eb668f042ac2b0826244964 +depends: +- mdurl >=0.1,<1 +- python >=3.10 +license: MIT +license_family: MIT +size: 64736 +timestamp: 1754951288511 +- conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py314h67df5f8_1.conda +sha256: c279be85b59a62d5c52f5dd9a4cd43ebd08933809a8416c22c3131595607d4cf +md5: 9a17c4307d23318476d7fbf0fedc0cde +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +constrains: +- jinja2 >=3.0.0 +license: BSD-3-Clause 
+license_family: BSD +size: 27424 +timestamp: 1772445227915 +- conda: https://conda.anaconda.org/conda-forge/linux-64/mathjax-2.7.7-ha770c72_3.tar.bz2 +sha256: 02fef69bde69db264a12f21386612262f545b6e3e68d8f1ccec19f3eaae58edf +md5: 86e69bd82c2a2c6fd29f5ab7e02b3691 +license: Apache-2.0 +license_family: Apache +size: 22281629 +timestamp: 1662784498331 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +sha256: 78c1bbe1723449c52b7a9df1af2ee5f005209f67e40b6e1d3c7619127c43b1c7 +md5: 592132998493b3ff25fd7479396e8351 +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 14465 +timestamp: 1733255681319 +- conda: https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +sha256: f005760b13093362fc9c997d603dd487de32ab2e821a3cbce52a42bcb8136517 +md5: 698a8a27c2b9d8a542c70cb47099a75e +depends: +- click +- coloredlogs +- humanize +- importlib-metadata +- jinja2 >=3.0.0 +- jsonschema +- markdown +- natsort +- numpy +- packaging +- pillow >=10.2.0 +- plotly >=5.18 +- polars-lts-cpu +- pyaml-env +- pydantic >=2.7.1 +- python >=3.8,!=3.14.1 +- python-dotenv +- python-kaleido 0.2.1 +- pyyaml >=4 +- requests +- rich >=10 +- rich-click +- spectra >=0.0.10 +- tiktoken +- tqdm +- typeguard +license: GPL-3.0-or-later +license_family: GPL3 +size: 4198799 +timestamp: 1765300743879 +- conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +sha256: 541fd4390a0687228b8578247f1536a821d9261389a65585af9d1a6f2a14e1e0 +md5: 30bec5e8f4c3969e2b1bd407c5e52afb +depends: +- python >=3.10 +- python +license: MIT +size: 280459 +timestamp: 1774380620329 +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +sha256: aeb1548eb72e4f198e72f19d242fb695b35add2ac7b2c00e0d83687052867680 +md5: e941e85e273121222580723010bd4fa2 +depends: +- python >=3.9 +- python +license: MIT +license_family: MIT +size: 39262 +timestamp: 1770905275632 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda +sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 +md5: 47e340acb35de30501a76c7c799c41d7 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=13 +license: X11 AND BSD-3-Clause +size: 891641 +timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +sha256: f6a82172afc50e54741f6f84527ef10424326611503c64e359e25a19a8e4c1c6 +md5: a2c1eeadae7a309daed9d62c96012a2b +depends: +- python >=3.11 +- python +constrains: +- numpy >=1.25 +- scipy >=1.11.2 +- matplotlib-base >=3.8 +- pandas >=2.0 +license: BSD-3-Clause +license_family: BSD +size: 1587439 +timestamp: 1765215107045 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nspr-4.38-h29cc59b_0.conda +sha256: e3664264bd936c357523b55c71ed5a30263c6ba278d726a75b1eb112e6fb0b64 +md5: e235d5566c9cc8970eb2798dd4ecf62f +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: MPL-2.0 +license_family: MOZILLA +size: 228588 +timestamp: 1762348634537 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nss-3.118-h445c969_0.conda +sha256: 44dd98ffeac859d84a6dcba79a2096193a42fc10b29b28a5115687a680dd6aea +md5: 567fbeed956c200c1db5782a424e58ee +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libsqlite >=3.51.0,<4.0a0 +- libstdcxx >=14 +- libzlib >=1.3.1,<2.0a0 +- nspr >=4.38,<5.0a0 +license: MPL-2.0 +license_family: MOZILLA +size: 2057773 +timestamp: 1763485556350 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda +sha256: f2ba8cb0d86a6461a6bcf0d315c80c7076083f72c6733c9290086640723f79ec +md5: 36f5b7eb328bdc204954a2225cf908e2 +depends: +- python +- libstdcxx >=14 +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- python_abi 3.14.* *_cp314 +- libcblas >=3.9.0,<4.0a0 +- liblapack >=3.9.0,<4.0a0 +- libblas >=3.9.0,<4.0a0 +constrains: +- numpy-base <0a0 +license: BSD-3-Clause +license_family: BSD +size: 8927860 +timestamp: 
1773839233468 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.4-h55fea9a_0.conda +sha256: 3900f9f2dbbf4129cf3ad6acf4e4b6f7101390b53843591c53b00f034343bc4d +md5: 11b3379b191f63139e29c0d19dee24cd +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libpng >=1.6.50,<1.7.0a0 +- libstdcxx >=14 +- libtiff >=4.7.1,<4.8.0a0 +- libzlib >=1.3.1,<2.0a0 +license: BSD-2-Clause +license_family: BSD +size: 355400 +timestamp: 1758489294972 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda +sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c +md5: f61eb8cd60ff9057122a3d338b99c00f +depends: +- __glibc >=2.17,<3.0.a0 +- ca-certificates +- libgcc >=14 +license: Apache-2.0 +license_family: Apache +size: 3164551 +timestamp: 1769555830639 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 +md5: b76541e68fea4d511b1ac46a28dcd2c6 +depends: +- python >=3.8 +- python +license: Apache-2.0 +license_family: APACHE +size: 72010 +timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-12.1.1-py314h8ec4b1a_0.conda +sha256: 9e6ec8f3213e8b7d64b0ad45f84c51a2c9eba4398efda31e196c9a56186133ee +md5: 79678378ae235e24b3aa83cee1b38207 +depends: +- python +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- libwebp-base >=1.6.0,<2.0a0 +- zlib-ng >=2.3.3,<2.4.0a0 +- python_abi 3.14.* *_cp314 +- tk >=8.6.13,<8.7.0a0 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libxcb >=1.17.0,<2.0a0 +- openjpeg >=2.5.4,<3.0a0 +- lcms2 >=2.18,<3.0a0 +- libtiff >=4.7.1,<4.8.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +license: HPND +size: 1073026 +timestamp: 1770794002408 +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +sha256: c418d325359fc7a0074cea7f081ef1bce26e114d2da8a0154c5d27ecc87a08e7 +md5: 3e9427ee186846052e81fadde8ebe96a +depends: +- narwhals >=1.15.1 +- packaging 
+- python >=3.10 +constrains: +- ipywidgets >=7.6 +license: MIT +license_family: MIT +size: 5251872 +timestamp: 1772628857717 +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +sha256: d332c2d5002fc440ae37ed9679ffc21b552f18d20232390005d1dd3bce0888d3 +md5: d5a4e013a30dd8dfde9ab39f45aaf9c1 +depends: +- polars-runtime-32 ==1.39.3 +- python >=3.10 +- python +constrains: +- numpy >=1.16.0 +- pyarrow >=7.0.0 +- fastexcel >=0.9 +- openpyxl >=3.0.0 +- xlsx2csv >=0.8.0 +- connectorx >=0.3.2 +- deltalake >=1.0.0 +- pyiceberg >=0.7.1 +- altair >=5.4.0 +- great_tables >=0.8.0 +- polars-runtime-32 ==1.39.3 +- polars-runtime-64 ==1.39.3 +- polars-runtime-compat ==1.39.3 +license: MIT +license_family: MIT +size: 533495 +timestamp: 1774207987966 +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +sha256: e466fb31f67ba9bde18deafeb34263ca5eb25807f39ead0e9d753a8e82c4c4f4 +md5: ef0340e75068ac8ff96462749b5c98e7 +depends: +- polars >=1.34.0 +- polars-runtime-compat >=1.34.0 +license: MIT +license_family: MIT +size: 3902 +timestamp: 1760206808444 +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-32-1.39.3-py310hffdcd12_1.conda +noarch: python +sha256: 9744f8086bb0832998f5b01076f57ddc9efbe460e493b14303c3567dc4f401e7 +md5: f9327f9f2cfc4215f55b613e64afd3ba +depends: +- python +- libstdcxx >=14 +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 37570276 +timestamp: 1774207987966 +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-compat-1.39.3-py310hbcd5346_1.conda +noarch: python +sha256: bf0b932713f0f27924f42159c98426e0073bb6145ed796eaa4cec79ca05363c7 +md5: 4b9b312453eebd6fbdbbe2a88fa1b5c4 +depends: +- python +- libgcc >=14 +- libstdcxx >=14 +- __glibc >=2.17,<3.0.a0 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 
+license: MIT +license_family: MIT +size: 37224264 +timestamp: 1774207985377 +- conda: https://conda.anaconda.org/conda-forge/linux-64/procps-ng-4.0.6-h18c060e_0.conda +sha256: 4ce2e1ee31a6217998f78c31ce7dc0a3e0557d9238b51d49dd20c52d467a126d +md5: f2c23a77b25efcad57d377b34bd84941 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-2.0-or-later AND LGPL-2.0-or-later +license_family: GPL +size: 593603 +timestamp: 1769710381284 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda +sha256: 9c88f8c64590e9567c6c80823f0328e58d3b1efb0e1c539c0315ceca764e0973 +md5: b3c17d95b5a10c6e64a21fa17573e70e +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=13 +license: MIT +license_family: MIT +size: 8252 +timestamp: 1726802366959 +- conda: https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +sha256: 58994e0d2ea8584cb399546e6f6896d771995e6121d1a7b6a2c9948388358932 +md5: e17be1016bcc3516827b836cd3e4d9dc +depends: +- python >=3.9 +- pyyaml >=5.0,<=7.0 +license: MIT +license_family: MIT +size: 14645 +timestamp: 1736766960536 +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +sha256: 868569d9505b7fe246c880c11e2c44924d7613a8cdcc1f6ef85d5375e892f13d +md5: c3946ed24acdb28db1b5d63321dbca7d +depends: +- typing-inspection >=0.4.2 +- typing_extensions >=4.14.1 +- python >=3.10 +- typing-extensions >=4.6.1 +- annotated-types >=0.6.0 +- pydantic-core ==2.41.5 +- python +license: MIT +license_family: MIT +size: 340482 +timestamp: 1764434463101 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-core-2.41.5-py314h2e6c369_1.conda +sha256: 7e0ae379796e28a429f8e48f2fe22a0f232979d65ec455e91f8dac689247d39f +md5: 432b0716a1dfac69b86aa38fdd59b7e6 +depends: +- python +- typing-extensions >=4.6.0,!=4.7.0 +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 
1943088 +timestamp: 1762988995556 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda +sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a +md5: 6b6ece66ebcae2d5f326c77ef2c5a066 +depends: +- python >=3.9 +license: BSD-2-Clause +license_family: BSD +size: 889287 +timestamp: 1750615908735 +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8 +md5: 461219d1a5bd61342293efa2c0c90eac +depends: +- __unix +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 21085 +timestamp: 1733217331982 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda +build_number: 101 +sha256: cb0628c5f1732f889f53a877484da98f5a0e0f47326622671396fb4f2b0cd6bd +md5: c014ad06e60441661737121d3eae8a60 +depends: +- __glibc >=2.17,<3.0.a0 +- bzip2 >=1.0.8,<2.0a0 +- ld_impl_linux-64 >=2.36.1 +- libexpat >=2.7.3,<3.0a0 +- libffi >=3.5.2,<3.6.0a0 +- libgcc >=14 +- liblzma >=5.8.2,<6.0a0 +- libmpdec >=4.0.0,<5.0a0 +- libsqlite >=3.51.2,<4.0a0 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- openssl >=3.5.5,<4.0a0 +- python_abi 3.14.* *_cp314 +- readline >=8.3,<9.0a0 +- tk >=8.6.13,<8.7.0a0 +- tzdata +- zstd >=1.5.7,<1.6.0a0 +license: Python-2.0 +size: 36702440 +timestamp: 1770675584356 +python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +sha256: 74e417a768f59f02a242c25e7db0aa796627b5bc8c818863b57786072aeb85e5 +md5: 130584ad9f3a513cdd71b1fdc1244e9c +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 27848 +timestamp: 1772388605021 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +sha256: 233aebd94c704ac112afefbb29cf4170b7bc606e22958906f2672081bc50638a +md5: 235765e4ea0d0301c75965985163b5a1 
+depends: +- cpython 3.14.3.* +- python_abi * *_cp314 +license: Python-2.0 +size: 50062 +timestamp: 1770674497152 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +sha256: e17bf63a30aec33432f1ead86e15e9febde9fc40a7f869c0e766be8d2db44170 +md5: 310259a5b03ff02289d7705f39e2b1d2 +depends: +- kaleido-core 0.2.1.* +- python >=3.5 +license: MIT +license_family: MIT +size: 18320 +timestamp: 1615204747600 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +build_number: 8 +sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 +md5: 0539938c55b6b1a59b560e843ad864a4 +constrains: +- python 3.14.* *_cp314 +license: BSD-3-Clause +license_family: BSD +size: 6989 +timestamp: 1752805904792 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda +sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d +md5: 2035f68f96be30dc60a5dfd7452c7941 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +- yaml >=0.2.5,<0.3.0a0 +license: MIT +license_family: MIT +size: 202391 +timestamp: 1770223462836 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda +sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 +md5: d7d95fc8287ea7bf33e0e7116d2b95ec +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-3.0-only +license_family: GPL +size: 345073 +timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +sha256: 0577eedfb347ff94d0f2fa6c052c502989b028216996b45c7f21236f25864414 +md5: 870293df500ca7e18bedefa5838a22ab +depends: +- attrs >=22.2.0 +- python >=3.10 +- rpds-py >=0.7.0 +- typing_extensions >=4.4.0 +- python +license: MIT +license_family: MIT +size: 51788 +timestamp: 1760379115194 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/regex-2026.2.28-py314h5bd0f2a_0.conda +sha256: e085e336f1446f5263a3ec9747df8c719b6996753901181add50dc4fdd8bb2e8 +md5: 3c8b6a8c4d0ff5a264e9831eac4941f4 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +license: Apache-2.0 AND CNRI-Python +license_family: PSF +size: 411924 +timestamp: 1772255161535 +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +sha256: 7813c38b79ae549504b2c57b3f33394cea4f2ad083f0994d2045c2e24cb538c5 +md5: c65df89a0b2e321045a9e01d1337b182 +depends: +- python >=3.10 +- certifi >=2017.4.17 +- charset-normalizer >=2,<4 +- idna >=2.5,<4 +- urllib3 >=1.21.1,<3 +- python +constrains: +- chardet >=3.0.2,<6 +license: Apache-2.0 +license_family: APACHE +size: 63602 +timestamp: 1766926974520 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +sha256: b06ce84d6a10c266811a7d3adbfa1c11f13393b91cc6f8a5b468277d90be9590 +md5: 7a6289c50631d620652f5045a63eb573 +depends: +- markdown-it-py >=2.2.0 +- pygments >=2.13.0,<3.0.0 +- python >=3.10 +- typing_extensions >=4.0.0,<5.0.0 +- python +license: MIT +license_family: MIT +size: 208472 +timestamp: 1771572730357 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +sha256: aa3fcb167321bae51998de2e94d199109c9024f25a5a063cb1c28d8f1af33436 +md5: 0c20a8ebcddb24a45da89d5e917e6cb9 +depends: +- python >=3.10 +- rich >=12 +- click >=8 +- typing-extensions >=4 +- __unix +- python +license: MIT +license_family: MIT +size: 64356 +timestamp: 1769850479089 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rpds-py-0.30.0-py314h2e6c369_0.conda +sha256: e53b0cbf3b324eaa03ca1fe1a688fdf4ab42cea9c25270b0a7307d8aaaa4f446 +md5: c1c368b5437b0d1a68f372ccf01cb133 +depends: +- python +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT 
+size: 376121 +timestamp: 1764543122774 +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +sha256: 7c65782d2511738e62c70462e89d65da4fa54d5a7e47c46667bcd27a59f81876 +md5: 472239e4eb7b5a84bb96b3ed7e3a596a +depends: +- colormath >=3.0.0 +- python >=3.9 +license: MIT +license_family: MIT +size: 22284 +timestamp: 1735770589188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.52.0-h04a0ce9_0.conda +sha256: c9af81e7830d9c4b67a7f48e512d060df2676b29cac59e3b31f09dbfcee29c58 +md5: 7d9d7efe9541d4bb71b5934e8ee348ea +depends: +- __glibc >=2.17,<3.0.a0 +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libsqlite 3.52.0 hf4e2dac_0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- readline >=8.3,<9.0a0 +license: blessing +size: 203641 +timestamp: 1772818888368 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tiktoken-0.12.0-py314h67fec18_3.conda +sha256: 7e395d67fd249d901beb1ae269057763c0d8c3ee5f7a348694bdb16d158a37d9 +md5: d705f9d8a1185a2b01cced191177a028 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +- regex >=2022.1.18 +- requests >=2.26.0 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 939648 +timestamp: 1764028306357 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda +sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac +md5: cffd3bdd58090148f4cfcd831f4b26ab +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +constrains: +- xorg-libx11 >=1.8.12,<2.0a0 +license: TCL +license_family: BSD +size: 3301196 +timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +sha256: 9ef8e47cf00e4d6dcc114eb32a1504cc18206300572ef14d76634ba29dfe1eb6 +md5: e5ce43272193b38c2e9037446c1d9206 +depends: +- python >=3.10 +- __unix +- python +license: MPL-2.0 and MIT +size: 94132 +timestamp: 
1770153424136 +- conda: https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +sha256: 39d8ae33c43cdb8f771373e149b0b4fae5a08960ac58dcca95b2f1642bb17448 +md5: 260af1b0a94f719de76b4e14094e9a3b +depends: +- importlib-metadata >=3.6 +- python >=3.10 +- typing-extensions >=4.10.0 +- typing_extensions >=4.14.0 +constrains: +- pytest >=7 +license: MIT +license_family: MIT +size: 36838 +timestamp: 1771532971545 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +sha256: 7c2df5721c742c2a47b2c8f960e718c930031663ac1174da67c1ed5999f7938c +md5: edd329d7d3a4ab45dcf905899a7a6115 +depends: +- typing_extensions ==4.15.0 pyhcf101f3_0 +license: PSF-2.0 +license_family: PSF +size: 91383 +timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +sha256: 70db27de58a97aeb7ba7448366c9853f91b21137492e0b4430251a1870aa8ff4 +md5: a0a4a3035667fc34f29bfbd5c190baa6 +depends: +- python >=3.10 +- typing_extensions >=4.12.0 +license: MIT +license_family: MIT +size: 18923 +timestamp: 1764158430324 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 +md5: 0caa1af407ecff61170c9437a808404d +depends: +- python >=3.10 +- python +license: PSF-2.0 +license_family: PSF +size: 51692 +timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c +md5: ad659d0a2b3e47e38d829aa8cad2d610 +license: LicenseRef-Public-Domain +size: 119135 +timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +sha256: af641ca7ab0c64525a96fd9ad3081b0f5bcf5d1cbb091afb3f6ed5a9eee6111a +md5: 9272daa869e03efe68833e3dc7a02130 +depends: +- backports.zstd >=1.0.0 +- brotli-python >=1.2.0 +- h2 >=4,<5 +- 
pysocks >=1.5.6,<2.0,!=1.5.7 +- python >=3.10 +license: MIT +license_family: MIT +size: 103172 +timestamp: 1767817860341 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb03c661_1.conda +sha256: 6bc6ab7a90a5d8ac94c7e300cc10beb0500eeba4b99822768ca2f2ef356f731b +md5: b2895afaf55bf96a8c8282a2e47a5de0 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 15321 +timestamp: 1762976464266 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb03c661_1.conda +sha256: 25d255fb2eef929d21ff660a0c687d38a6d2ccfbcbf0cc6aa738b12af6e9d142 +md5: 1dafce8548e38671bea82e3f5c6ce22f +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 20591 +timestamp: 1762976546182 +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda +sha256: 6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad +md5: a77f85f77be52ff59391544bfe73390a +depends: +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +license: MIT +license_family: MIT +size: 85189 +timestamp: 1753484064210 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +sha256: b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae +md5: 30cd29cb87d819caead4d55184c1d115 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 24194 +timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.3.3-hceb46e0_1.conda +sha256: ea4e50c465d70236408cb0bfe0115609fd14db1adcd8bd30d8918e0291f8a75f +md5: 2aadb0d17215603a82a2a6b0afd9a4cb +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: Zlib +license_family: Other +size: 122618 +timestamp: 1770167931827 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 +md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 +depends: +- __glibc 
>=2.17,<3.0.a0 +- libzlib >=1.3.1,<2.0a0 +license: BSD-3-Clause +license_family: BSD +size: 601375 +timestamp: 1764777111296 diff --git a/modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt b/modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt new file mode 100644 index 00000000..a58231a0 --- /dev/null +++ b/modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt @@ -0,0 +1,1502 @@ + +version: 6 +environments: +default: +channels: +- url: https://conda.anaconda.org/conda-forge/ +- url: https://conda.anaconda.org/bioconda/ +- url: https://conda.anaconda.org/bioconda/ +options: +pypi-prerelease-mode: if-necessary-or-explicit +packages: +linux-aarch64: +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-python-1.2.0-py314h352cb57_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/expat-2.7.4-hfae3067_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.17.1-hba86a56_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/icu-78.3-hcab7f73_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/kaleido-core-0.2.1-he5a581e_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.18-h9d5b58d_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lerc-4.1.0-h52b7260_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.7.4-hfae3067_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.14.3-h8af1aa0_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.14.3-hdae7a39_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.2.0-he9431aa_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-15.2.0-he9431aa_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran5-15.2.0-h1b7bec0_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.2-he30d5cf_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.11.0-5_h88aeb00_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.2-he30d5cf_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.30-pthreads_h9d3fd7e_4.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.55-h1abf092_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.52.0-h10b116e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.2.0-hef695bb_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.7.1-hdb009f0_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.41.3-h1022ec0_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libwebp-base-1.6.0-ha2e29f5_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.17.0-h262b8f6_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/markupsafe-3.0.3-py314hb76de3f_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mathjax-2.7.7-h8af1aa0_3.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.5-ha32ae93_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nspr-4.38-h3ad9384_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nss-3.118-h544fa81_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.4.3-py314haac167e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.4-h5da879a_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.1-h546c87b_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-12.1.1-py314hac3e5ec_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-32-1.39.3-py310hff09b76_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-compat-1.39.3-py310hf00a4a2_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/procps-ng-4.0.6-h1779866_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pthread-stubs-0.4-h86ecc28_1002.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pydantic-core-2.41.5-py314h451b6cc_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.14.3-hb06a95a_101_cp314.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/pyyaml-6.0.3-py314h807365f_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/regex-2026.2.28-py314h51f160d_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rpds-py-0.30.0-py314h02b7a91_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sqlite-3.52.0-hf1c7be2_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tiktoken-0.12.0-py314h6a36e60_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxau-1.0.12-he30d5cf_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdmcp-1.1.5-he30d5cf_1.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/yaml-0.2.5-h80f16a2_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-ng-2.3.3-ha7cb516_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda +build_number: 20 +sha256: a2527b1d81792a0ccd2c05850960df119c2b6d8f5fdec97f2db7d25dc23b1068 +md5: 468fd3bb9e1f671d36c2cbc677e56f1d +depends: +- libgomp >=7.5.0 +constrains: +- openmp_impl <0.0a0 +license: BSD-3-Clause +license_family: BSD +size: 28926 +timestamp: 1770939656741 +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +sha256: a3967b937b9abf0f2a99f3173fa4630293979bd1644709d89580e7c62a544661 +md5: aaa2a381ccc56eac91d63b6c1240312f +depends: +- cpython +- python-gil +license: MIT +license_family: MIT +size: 8191 +timestamp: 1744137672556 +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +sha256: e0ea1ba78fbb64f17062601edda82097fcf815012cf52bb704150a2668110d48 +md5: 2934f256a8acfe48f6ebb4fce6cde29c +depends: +- python >=3.9 +- typing-extensions >=4.0.0 +license: MIT +license_family: MIT +size: 18074 +timestamp: 1733247158254 +- conda: https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +sha256: 1b6124230bb4e571b1b9401537ecff575b7b109cc3a21ee019f65e083b8399ab +md5: c6b0543676ecb1fb2d7643941fe375f2 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 64927 +timestamp: 1773935801332 +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda +noarch: generic +sha256: c31ab719d256bc6f89926131e88ecd0f0c5d003fe8481852c6424f4ec6c7eb29 +md5: a2ac7763a9ac75055b68f325d3255265 +depends: +- python >=3.14 +license: BSD-3-Clause AND MIT AND 
EPL-2.0 +size: 7514 +timestamp: 1767044983590 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-python-1.2.0-py314h352cb57_1.conda +sha256: 5a5b0cdcd7ed89c6a8fb830924967f6314a2b71944bc1ebc2c105781ba97aa75 +md5: a1b5c571a0923a205d663d8678df4792 +depends: +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python >=3.14,<3.15.0a0 *_cp314 +- python_abi 3.14.* *_cp314 +constrains: +- libbrotlicommon 1.2.0 he30d5cf_1 +license: MIT +license_family: MIT +size: 373193 +timestamp: 1764017486851 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda +sha256: b3495077889dde6bb370938e7db82be545c73e8589696ad0843a32221520ad4c +md5: 840d8fc0d7b3209be93080bc20e07f2d +depends: +- libgcc >=14 +license: bzip2-1.0.6 +license_family: BSD +size: 192412 +timestamp: 1771350241232 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc +md5: 4492fd26db29495f0ba23f146cd5638d +depends: +- __unix +license: ISC +size: 147413 +timestamp: 1772006283803 +- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +sha256: a6b118fd1ed6099dc4fc03f9c492b88882a780fadaef4ed4f93dc70757713656 +md5: 765c4d97e877cdbbb88ff33152b86125 +depends: +- python >=3.10 +license: ISC +size: 151445 +timestamp: 1772001170301 +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +sha256: d86dfd428b2e3c364fa90e07437c8405d635aa4ef54b25ab51d9c712be4112a5 +md5: 49ee13eb9b8f44d63879c69b8a40a74b +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 58510 +timestamp: 1773660086450 +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715 +md5: ea8a6c3256897cc31263de9f455e25d9 +depends: +- python >=3.10 +- __unix +- python +license: BSD-3-Clause 
+license_family: BSD +size: 97676 +timestamp: 1764518652276 +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +sha256: 8021c76eeadbdd5784b881b165242db9449783e12ce26d6234060026fd6a8680 +md5: b866ff7007b934d564961066c8195983 +depends: +- humanfriendly >=9.1 +- python >=3.9 +license: MIT +license_family: MIT +size: 43758 +timestamp: 1733928076798 +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +sha256: 59c9e29800b483b390467f90e82b0da3a4fbf0612efe1c90813fca232780e160 +md5: 071cf7b0ce333c81718b054066c15102 +depends: +- networkx >=2.0 +- numpy +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 39326 +timestamp: 1735759976140 +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +noarch: generic +sha256: 91b06300879df746214f7363d6c27c2489c80732e46a369eb2afc234bcafb44c +md5: 3bb89e4f795e5414addaa531d6b1500a +depends: +- python >=3.14,<3.15.0a0 +- python_abi * *_cp314 +license: Python-2.0 +size: 50078 +timestamp: 1770674447292 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/expat-2.7.4-hfae3067_0.conda +sha256: 5f087bef054c681edcaae84a8c2230585b938691e371ff92957a30707b7fcdf7 +md5: b304307db639831ad7caabd2eac6fca6 +depends: +- libexpat 2.7.4 hfae3067_0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 137701 +timestamp: 1771259543650 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +sha256: 58d7f40d2940dd0a8aa28651239adbf5613254df0f75789919c4e6762054403b +md5: 0c96522c6bdaed4b1566d11387caaf45 +license: BSD-3-Clause +license_family: BSD +size: 397370 +timestamp: 1566932522327 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +sha256: c52a29fdac682c20d252facc50f01e7c2e7ceac52aa9817aaf0bb83f7559ec5c +md5: 34893075a5c9e55cdafac56607368fc6 +license: OFL-1.1 +license_family: Other +size: 96530 +timestamp: 
1620479909603 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +sha256: 00925c8c055a2275614b4d983e1df637245e19058d79fc7dd1a93b8d9fb4b139 +md5: 4d59c254e01d9cde7957100457e2d5fb +license: OFL-1.1 +license_family: Other +size: 700814 +timestamp: 1620479612257 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +sha256: 2821ec1dc454bd8b9a31d0ed22a7ce22422c0aef163c59f49dfdf915d0f0ca14 +md5: 49023d73832ef61042f6a237cb2687e7 +license: LicenseRef-Ubuntu-Font-Licence-Version-1.0 +license_family: Other +size: 1620504 +timestamp: 1727511233259 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.17.1-hba86a56_0.conda +sha256: 835aff8615dd8d8fff377679710ce81b8a2c47b6404e21a92fb349fda193a15c +md5: 0fed1ff55f4938a65907f3ecf62609db +depends: +- libexpat >=2.7.4,<3.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +- libgcc >=14 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +license: MIT +license_family: MIT +size: 279044 +timestamp: 1771382728182 +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +sha256: 54eea8469786bc2291cc40bca5f46438d3e062a399e8f53f013b6a9f50e98333 +md5: a7970cd949a077b7cb9696379d338681 +depends: +- font-ttf-ubuntu +- font-ttf-inconsolata +- font-ttf-dejavu-sans-mono +- font-ttf-source-code-pro +license: BSD-3-Clause +license_family: BSD +size: 4059 +timestamp: 1762351264405 +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +sha256: 84c64443368f84b600bfecc529a1194a3b14c3656ee2e832d15a20e0329b6da3 +md5: 164fc43f0b53b6e3a7bc7dce5e4f1dc9 +depends: +- python >=3.10 +- hyperframe >=6.1,<7 +- hpack >=4.1,<5 +- python +license: MIT +license_family: MIT +size: 95967 +timestamp: 1756364871835 +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba +md5: 
0a802cb9888dd14eeefc611f05c40b6e +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 30731 +timestamp: 1737618390337 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +sha256: fa2071da7fab758c669e78227e6094f6b3608228740808a6de5d6bce83d9e52d +md5: 7fe569c10905402ed47024fc481bb371 +depends: +- __unix +- python >=3.9 +license: MIT +license_family: MIT +size: 73563 +timestamp: 1733928021866 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +sha256: 6c4343b376d0b12a4c75ab992640970d36c933cad1fd924f6a1181fa91710e80 +md5: daddf757c3ecd6067b9af1df1f25d89e +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 67994 +timestamp: 1766267728652 +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8 +md5: 8e6923fc12f1fe8f8c4e5c9f343256ac +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 17397 +timestamp: 1737618427549 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/icu-78.3-hcab7f73_0.conda +sha256: 49ba6aed2c6b482bb0ba41078057555d29764299bc947b990708617712ef6406 +md5: 546da38c2fa9efacf203e2ad3f987c59 +depends: +- libgcc >=14 +- libstdcxx >=14 +license: MIT +license_family: MIT +size: 12837286 +timestamp: 1773822650615 +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +sha256: ae89d0299ada2a3162c2614a9d26557a92aa6a77120ce142f8e0109bbf0342b0 +md5: 53abe63df7e10a6ba605dc5f9f961d36 +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 50721 +timestamp: 1760286526795 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +sha256: 82ab2a0d91ca1e7e63ab6a4939356667ef683905dea631bc2121aa534d347b16 +md5: 080594bf4493e6bae2607e65390c520a +depends: +- python >=3.10 +- zipp >=3.20 +- python +license: Apache-2.0 +license_family: APACHE 
+size: 34387 +timestamp: 1773931568510 +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +sha256: fc9ca7348a4f25fed2079f2153ecdcf5f9cf2a0bc36c4172420ca09e1849df7b +md5: 04558c96691bed63104678757beb4f8d +depends: +- markupsafe >=2.0 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 120685 +timestamp: 1764517220861 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +sha256: db973a37d75db8e19b5f44bbbdaead0c68dde745407f281e2a7fe4db74ec51d7 +md5: ada41c863af263cc4c5fcbaff7c3e4dc +depends: +- attrs >=22.2.0 +- jsonschema-specifications >=2023.3.6 +- python >=3.10 +- referencing >=0.28.4 +- rpds-py >=0.25.0 +- python +license: MIT +license_family: MIT +size: 82356 +timestamp: 1767839954256 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +sha256: 0a4f3b132f0faca10c89fdf3b60e15abb62ded6fa80aebfc007d05965192aa04 +md5: 439cd0f567d697b20a8f45cb70a1005a +depends: +- python >=3.10 +- referencing >=0.31.0 +- python +license: MIT +license_family: MIT +size: 19236 +timestamp: 1757335715225 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/kaleido-core-0.2.1-he5a581e_0.tar.bz2 +sha256: d3c7f4797566e6f983d16c2a87063a18e4b2d819a66230190a21584d70042755 +md5: 4f0d284f5d11e04277b552eb1c172c7f +depends: +- __glibc >=2.17,<3.0.a0 +- expat >=2.2.10,<3.0.0a0 +- fontconfig +- fonts-conda-forge +- libgcc-ng >=9.3.0 +- mathjax 2.7.* +- nspr >=4.29,<5.0a0 +- nss >=3.62,<4.0a0 +- sqlite >=3.34.0,<4.0a0 +license: MIT +license_family: MIT +size: 65750397 +timestamp: 1615199465742 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.18-h9d5b58d_0.conda +sha256: 379ef5e91a587137391a6149755d0e929f1a007d2dcb211318ac670a46c8596f +md5: bb960f01525b5e001608afef9d47b79c +depends: +- libgcc >=14 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libtiff >=4.7.1,<4.8.0a0 +license: MIT +license_family: MIT +size: 293039 
+timestamp: 1768184778398 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda +sha256: 7abd913d81a9bf00abb699e8987966baa2065f5132e37e815f92d90fc6bba530 +md5: a21644fc4a83da26452a718dc9468d5f +depends: +- zstd >=1.5.7,<1.6.0a0 +constrains: +- binutils_impl_linux-aarch64 2.45.1 +license: GPL-3.0-only +license_family: GPL +size: 875596 +timestamp: 1774197520746 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lerc-4.1.0-h52b7260_0.conda +sha256: 8957fd460c1c132c8031f65fd5f56ec3807fd71b7cab2c5e2b0937b13404ab36 +md5: d13423b06447113a90b5b1366d4da171 +depends: +- libgcc >=14 +- libstdcxx >=14 +license: Apache-2.0 +license_family: Apache +size: 240444 +timestamp: 1773114901155 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda +build_number: 5 +sha256: 700f3c03d0fba8e687a345404a45fbabe781c1cf92242382f62cef2948745ec4 +md5: 5afcea37a46f76ec1322943b3c4dfdc0 +depends: +- libopenblas >=0.3.30,<0.3.31.0a0 +- libopenblas >=0.3.30,<1.0a0 +constrains: +- mkl <2026 +- libcblas 3.11.0 5*_openblas +- liblapack 3.11.0 5*_openblas +- liblapacke 3.11.0 5*_openblas +- blas 2.305 openblas +license: BSD-3-Clause +license_family: BSD +size: 18369 +timestamp: 1765818610617 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda +build_number: 5 +sha256: 3fad5c9de161dccb4e42c8b1ae8eccb33f4ed56bccbcced9cbb0956ae7869e61 +md5: 0b2f1143ae2d0aa4c991959d0daaf256 +depends: +- libblas 3.11.0 5_haddc8a3_openblas +constrains: +- liblapack 3.11.0 5*_openblas +- liblapacke 3.11.0 5*_openblas +- blas 2.305 openblas +license: BSD-3-Clause +license_family: BSD +size: 18371 +timestamp: 1765818618899 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda +sha256: 48814b73bd462da6eed2e697e30c060ae16af21e9fbed30d64feaf0aad9da392 +md5: a9138815598fe6b91a1d6782ca657b0c +depends: +- libgcc >=14 
+license: MIT +license_family: MIT +size: 71117 +timestamp: 1761979776756 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.7.4-hfae3067_0.conda +sha256: 995ce3ad96d0f4b5ed6296b051a0d7b6377718f325bc0e792fbb96b0e369dad7 +md5: 57f3b3da02a50a1be2a6fe847515417d +depends: +- libgcc >=14 +constrains: +- expat 2.7.4.* +license: MIT +license_family: MIT +size: 76564 +timestamp: 1771259530958 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda +sha256: 3df4c539449aabc3443bbe8c492c01d401eea894603087fca2917aa4e1c2dea9 +md5: 2f364feefb6a7c00423e80dcb12db62a +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 55952 +timestamp: 1769456078358 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.14.3-h8af1aa0_0.conda +sha256: 752e4f66283d7deb4c6fd47d88df644d8daa2aaa825a54f3bf350a625190192a +md5: a229e22d4d8814a07702b0919d8e6701 +depends: +- libfreetype6 >=2.14.3 +license: GPL-2.0-only OR FTL +size: 8125 +timestamp: 1774301094057 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.14.3-hdae7a39_0.conda +sha256: 8e6b27fe4eec4c2fa7b7769a21973734c8dba1de80086fb0213e58375ac09f4c +md5: b99ed99e42dafb27889483b3098cace7 +depends: +- libgcc >=14 +- libpng >=1.6.55,<1.7.0a0 +- libzlib >=1.3.2,<2.0a0 +constrains: +- freetype >=2.14.3 +license: GPL-2.0-only OR FTL +size: 422941 +timestamp: 1774301093473 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda +sha256: 43df385bedc1cab11993c4369e1f3b04b4ca5d0ea16cba6a0e7f18dbc129fcc9 +md5: 552567ea2b61e3a3035759b2fdb3f9a6 +depends: +- _openmp_mutex >=4.5 +constrains: +- libgcc-ng ==15.2.0=*_18 +- libgomp 15.2.0 h8acb6b2_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 622900 +timestamp: 1771378128706 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.2.0-he9431aa_18.conda +sha256: 
83bb0415f59634dccfa8335d4163d1f6db00a27b36666736f9842b650b92cf2f +md5: 4feebd0fbf61075a1a9c2e9b3936c257 +depends: +- libgcc 15.2.0 h8acb6b2_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27568 +timestamp: 1771378136019 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-15.2.0-he9431aa_18.conda +sha256: 7dcd7dff2505d56fd5272a6e712ec912f50a46bf07dc6873a7e853694304e6e4 +md5: 41f261f5e4e2e8cbd236c2f1f15dae1b +depends: +- libgfortran5 15.2.0 h1b7bec0_18 +constrains: +- libgfortran-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27587 +timestamp: 1771378169244 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran5-15.2.0-h1b7bec0_18.conda +sha256: 85347670dfb4a8d4c13cd7cae54138dcf2b1606b6bede42eef5507bf5f9660c6 +md5: 574d88ce3348331e962cfa5ed451b247 +depends: +- libgcc >=15.2.0 +constrains: +- libgfortran 15.2.0 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 1486341 +timestamp: 1771378148102 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda +sha256: fc716f11a6a8525e27a5d332ef6a689210b0d2a4dd1133edc0f530659aa9faa6 +md5: 4faa39bf919939602e594253bd673958 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 588060 +timestamp: 1771378040807 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.2-he30d5cf_0.conda +sha256: 84064c7c53a64291a585d7215fe95ec42df74203a5bf7615d33d49a3b0f08bb6 +md5: 5109d7f837a3dfdf5c60f60e311b041f +depends: +- libgcc >=14 +constrains: +- jpeg <0.0.0a +license: IJG AND BSD-3-Clause AND Zlib +size: 691818 +timestamp: 1762094728337 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.11.0-5_h88aeb00_openblas.conda +build_number: 5 +sha256: 692222d186d3ffbc99eaf04b5b20181fd26aee1edec1106435a0a755c57cce86 +md5: 88d1e4133d1182522b403e9ba7435f04 +depends: +- libblas 3.11.0 5_haddc8a3_openblas 
+constrains: +- liblapacke 3.11.0 5*_openblas +- blas 2.305 openblas +- libcblas 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18392 +timestamp: 1765818627104 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.2-he30d5cf_0.conda +sha256: 843c46e20519651a3e357a8928352b16c5b94f4cd3d5481acc48be2e93e8f6a3 +md5: 96944e3c92386a12755b94619bae0b35 +depends: +- libgcc >=14 +constrains: +- xz 5.8.2.* +license: 0BSD +size: 125916 +timestamp: 1768754941722 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda +sha256: 57c0dd12d506e84541c4e877898bd2a59cca141df493d34036f18b2751e0a453 +md5: 7b9813e885482e3ccb1fa212b86d7fd0 +depends: +- libgcc >=14 +license: BSD-2-Clause +license_family: BSD +size: 114056 +timestamp: 1769482343003 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.30-pthreads_h9d3fd7e_4.conda +sha256: 794a7270ea049ec931537874cd8d2de0ef4b3cef71c055cfd8b4be6d2f4228b0 +md5: 11d7d57b7bdd01da745bbf2b67020b2e +depends: +- libgcc >=14 +- libgfortran +- libgfortran5 >=14.3.0 +constrains: +- openblas >=0.3.30,<0.3.31.0a0 +license: BSD-3-Clause +license_family: BSD +size: 4959359 +timestamp: 1763114173544 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.55-h1abf092_0.conda +sha256: c7378c6b79de4d571d00ad1caf0a4c19d43c9c94077a761abb6ead44d891f907 +md5: be4088903b94ea297975689b3c3aeb27 +depends: +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +license: zlib-acknowledgement +size: 340156 +timestamp: 1770691477245 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.52.0-h10b116e_0.conda +sha256: 1ddaf91b44fae83856276f4cb7ce544ffe41d4b55c1e346b504c6b45f19098d6 +md5: 77891484f18eca74b8ad83694da9815e +depends: +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +license: blessing +size: 952296 +timestamp: 1772818881550 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.2.0-hef695bb_18.conda 
+sha256: 31fdb9ffafad106a213192d8319b9f810e05abca9c5436b60e507afb35a6bc40 +md5: f56573d05e3b735cb03efeb64a15f388 +depends: +- libgcc 15.2.0 h8acb6b2_18 +constrains: +- libstdcxx-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 5541411 +timestamp: 1771378162499 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.7.1-hdb009f0_1.conda +sha256: 7ff79470db39e803e21b8185bc8f19c460666d5557b1378d1b1e857d929c6b39 +md5: 8c6fd84f9c87ac00636007c6131e457d +depends: +- lerc >=4.0.0,<5.0a0 +- libdeflate >=1.25,<1.26.0a0 +- libgcc >=14 +- libjpeg-turbo >=3.1.0,<4.0a0 +- liblzma >=5.8.1,<6.0a0 +- libstdcxx >=14 +- libwebp-base >=1.6.0,<2.0a0 +- libzlib >=1.3.1,<2.0a0 +- zstd >=1.5.7,<1.6.0a0 +license: HPND +size: 488407 +timestamp: 1762022048105 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.41.3-h1022ec0_0.conda +sha256: c37a8e89b700646f3252608f8368e7eb8e2a44886b92776e57ad7601fc402a11 +md5: cf2861212053d05f27ec49c3784ff8bb +depends: +- libgcc >=14 +license: BSD-3-Clause +license_family: BSD +size: 43453 +timestamp: 1766271546875 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libwebp-base-1.6.0-ha2e29f5_0.conda +sha256: b03700a1f741554e8e5712f9b06dd67e76f5301292958cd3cb1ac8c6fdd9ed25 +md5: 24e92d0942c799db387f5c9d7b81f1af +depends: +- libgcc >=14 +constrains: +- libwebp 1.6.0 +license: BSD-3-Clause +license_family: BSD +size: 359496 +timestamp: 1752160685488 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.17.0-h262b8f6_0.conda +sha256: 461cab3d5650ac6db73a367de5c8eca50363966e862dcf60181d693236b1ae7b +md5: cd14ee5cca2464a425b1dbfc24d90db2 +depends: +- libgcc >=13 +- pthread-stubs +- xorg-libxau >=1.0.11,<2.0a0 +- xorg-libxdmcp +license: MIT +license_family: MIT +size: 397493 +timestamp: 1727280745441 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda +sha256: 
eb111e32e5a7313a5bf799c7fb2419051fa2fe7eff74769fac8d5a448b309f7f +md5: 502006882cf5461adced436e410046d1 +constrains: +- zlib 1.3.2 *_2 +license: Zlib +license_family: Other +size: 69833 +timestamp: 1774072605429 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +sha256: 20e0892592a3e7c683e3d66df704a9425d731486a97c34fc56af4da1106b2b6b +md5: ba0a9221ce1063f31692c07370d062f3 +depends: +- importlib-metadata >=4.4 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 85893 +timestamp: 1770694658918 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +sha256: 7b1da4b5c40385791dbc3cc85ceea9fad5da680a27d5d3cb8bfaa185e304a89e +md5: 5b5203189eb668f042ac2b0826244964 +depends: +- mdurl >=0.1,<1 +- python >=3.10 +license: MIT +license_family: MIT +size: 64736 +timestamp: 1754951288511 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/markupsafe-3.0.3-py314hb76de3f_1.conda +sha256: 383c188496d13a55658c06e61e7d4cdff2c9f9d5a0648769fca8250bece7e0ef +md5: e5de3c36dd548b35ff2a8aa49208dcb3 +depends: +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +constrains: +- jinja2 >=3.0.0 +license: BSD-3-Clause +license_family: BSD +size: 27913 +timestamp: 1772446407659 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mathjax-2.7.7-h8af1aa0_3.tar.bz2 +sha256: 8fd4c79d6eda3d4cba73783114305a53a154ada4d1e334d4e02cb3521429599b +md5: 7b08314a6867a9d5648a1c3265e9eb8e +license: Apache-2.0 +license_family: Apache +size: 22257008 +timestamp: 1662784555011 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +sha256: 78c1bbe1723449c52b7a9df1af2ee5f005209f67e40b6e1d3c7619127c43b1c7 +md5: 592132998493b3ff25fd7479396e8351 +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 14465 +timestamp: 1733255681319 +- conda: https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +sha256: 
f005760b13093362fc9c997d603dd487de32ab2e821a3cbce52a42bcb8136517 +md5: 698a8a27c2b9d8a542c70cb47099a75e +depends: +- click +- coloredlogs +- humanize +- importlib-metadata +- jinja2 >=3.0.0 +- jsonschema +- markdown +- natsort +- numpy +- packaging +- pillow >=10.2.0 +- plotly >=5.18 +- polars-lts-cpu +- pyaml-env +- pydantic >=2.7.1 +- python >=3.8,!=3.14.1 +- python-dotenv +- python-kaleido 0.2.1 +- pyyaml >=4 +- requests +- rich >=10 +- rich-click +- spectra >=0.0.10 +- tiktoken +- tqdm +- typeguard +license: GPL-3.0-or-later +license_family: GPL3 +size: 4198799 +timestamp: 1765300743879 +- conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +sha256: 541fd4390a0687228b8578247f1536a821d9261389a65585af9d1a6f2a14e1e0 +md5: 30bec5e8f4c3969e2b1bd407c5e52afb +depends: +- python >=3.10 +- python +license: MIT +size: 280459 +timestamp: 1774380620329 +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +sha256: aeb1548eb72e4f198e72f19d242fb695b35add2ac7b2c00e0d83687052867680 +md5: e941e85e273121222580723010bd4fa2 +depends: +- python >=3.9 +- python +license: MIT +license_family: MIT +size: 39262 +timestamp: 1770905275632 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.5-ha32ae93_3.conda +sha256: 91cfb655a68b0353b2833521dc919188db3d8a7f4c64bea2c6a7557b24747468 +md5: 182afabe009dc78d8b73100255ee6868 +depends: +- libgcc >=13 +license: X11 AND BSD-3-Clause +size: 926034 +timestamp: 1738196018799 +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +sha256: f6a82172afc50e54741f6f84527ef10424326611503c64e359e25a19a8e4c1c6 +md5: a2c1eeadae7a309daed9d62c96012a2b +depends: +- python >=3.11 +- python +constrains: +- numpy >=1.25 +- scipy >=1.11.2 +- matplotlib-base >=3.8 +- pandas >=2.0 +license: BSD-3-Clause +license_family: BSD +size: 1587439 +timestamp: 1765215107045 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/nspr-4.38-h3ad9384_0.conda +sha256: 78a06e89285fef242e272998b292c1e621e3ee3dd4fba62ec014e503c7ec118f +md5: 6dd4f07147774bf720075a210f8026b9 +depends: +- libgcc >=14 +- libstdcxx >=14 +license: MPL-2.0 +license_family: MOZILLA +size: 235140 +timestamp: 1762350120355 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nss-3.118-h544fa81_0.conda +sha256: 48942696889367ffd448f8dccfc080fb7e130b9938a4a3b6b20ef8e6af856463 +md5: 4540f9570d12db2150f42ba036154552 +depends: +- libgcc >=14 +- libsqlite >=3.51.0,<4.0a0 +- libstdcxx >=14 +- libzlib >=1.3.1,<2.0a0 +- nspr >=4.38,<5.0a0 +license: MPL-2.0 +license_family: MOZILLA +size: 2061869 +timestamp: 1763490303490 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.4.3-py314haac167e_0.conda +sha256: a6d42fd88afc57c3b0a57b21a12eff7492dfc419bb61ee3f74e9ba6261dabc88 +md5: 25d896c331481145720a21e5145fad65 +depends: +- python +- libgcc >=14 +- python 3.14.* *_cp314 +- libstdcxx >=14 +- libcblas >=3.9.0,<4.0a0 +- liblapack >=3.9.0,<4.0a0 +- python_abi 3.14.* *_cp314 +- libblas >=3.9.0,<4.0a0 +constrains: +- numpy-base <0a0 +license: BSD-3-Clause +license_family: BSD +size: 8008045 +timestamp: 1773839355275 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.4-h5da879a_0.conda +sha256: bd1bc8bdde5e6c5cbac42d462b939694e40b59be6d0698f668515908640c77b8 +md5: cea962410e327262346d48d01f05936c +depends: +- libgcc >=14 +- libpng >=1.6.50,<1.7.0a0 +- libstdcxx >=14 +- libtiff >=4.7.1,<4.8.0a0 +- libzlib >=1.3.1,<2.0a0 +license: BSD-2-Clause +license_family: BSD +size: 392636 +timestamp: 1758489353577 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.1-h546c87b_1.conda +sha256: 7f8048c0e75b2620254218d72b4ae7f14136f1981c5eb555ef61645a9344505f +md5: 25f5885f11e8b1f075bccf4a2da91c60 +depends: +- ca-certificates +- libgcc >=14 +license: Apache-2.0 +license_family: Apache +size: 3692030 +timestamp: 1769557678657 +- 
conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 +md5: b76541e68fea4d511b1ac46a28dcd2c6 +depends: +- python >=3.8 +- python +license: Apache-2.0 +license_family: APACHE +size: 72010 +timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-12.1.1-py314hac3e5ec_0.conda +sha256: 1ca2d1616baad9bccb7ebc425ef2dcd6cebe742fbe91edf226fb606ad371ca0f +md5: d3c959c7efe560b2d7da459d69121fe9 +depends: +- python +- python 3.14.* *_cp314 +- libgcc >=14 +- zlib-ng >=2.3.3,<2.4.0a0 +- libwebp-base >=1.6.0,<2.0a0 +- tk >=8.6.13,<8.7.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +- libtiff >=4.7.1,<4.8.0a0 +- lcms2 >=2.18,<3.0a0 +- python_abi 3.14.* *_cp314 +- openjpeg >=2.5.4,<3.0a0 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libxcb >=1.17.0,<2.0a0 +license: HPND +size: 1051828 +timestamp: 1770794010335 +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +sha256: c418d325359fc7a0074cea7f081ef1bce26e114d2da8a0154c5d27ecc87a08e7 +md5: 3e9427ee186846052e81fadde8ebe96a +depends: +- narwhals >=1.15.1 +- packaging +- python >=3.10 +constrains: +- ipywidgets >=7.6 +license: MIT +license_family: MIT +size: 5251872 +timestamp: 1772628857717 +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +sha256: d332c2d5002fc440ae37ed9679ffc21b552f18d20232390005d1dd3bce0888d3 +md5: d5a4e013a30dd8dfde9ab39f45aaf9c1 +depends: +- polars-runtime-32 ==1.39.3 +- python >=3.10 +- python +constrains: +- numpy >=1.16.0 +- pyarrow >=7.0.0 +- fastexcel >=0.9 +- openpyxl >=3.0.0 +- xlsx2csv >=0.8.0 +- connectorx >=0.3.2 +- deltalake >=1.0.0 +- pyiceberg >=0.7.1 +- altair >=5.4.0 +- great_tables >=0.8.0 +- polars-runtime-32 ==1.39.3 +- polars-runtime-64 ==1.39.3 +- polars-runtime-compat ==1.39.3 +license: MIT +license_family: MIT +size: 533495 +timestamp: 1774207987966 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +sha256: e466fb31f67ba9bde18deafeb34263ca5eb25807f39ead0e9d753a8e82c4c4f4 +md5: ef0340e75068ac8ff96462749b5c98e7 +depends: +- polars >=1.34.0 +- polars-runtime-compat >=1.34.0 +license: MIT +license_family: MIT +size: 3902 +timestamp: 1760206808444 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-32-1.39.3-py310hff09b76_1.conda +noarch: python +sha256: c070be507c5a90df397a47ae0299660be437d5546d68f1bc0fa4402c9f07d59e +md5: 3c1a7c6b4ba8b9fb773ace9723f8a5db +depends: +- python +- libgcc >=14 +- libstdcxx >=14 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 34785466 +timestamp: 1774207998285 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-compat-1.39.3-py310hf00a4a2_1.conda +noarch: python +sha256: 683315f1a49e47ce72bf9462419733b40b588b2b3106552d95fd4cd994e174de +md5: dd3464e2132dc3a783e76e5078870c76 +depends: +- python +- libgcc >=14 +- libstdcxx >=14 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 34652491 +timestamp: 1774207996879 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/procps-ng-4.0.6-h1779866_0.conda +sha256: e9cbcbc94e151ada3d6dc365380aaaf591f65012c16d9a2abaea4b9b90adc402 +md5: ab7288cc39545556d1bc5e71ab2df9a9 +depends: +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-2.0-or-later AND LGPL-2.0-or-later +license_family: GPL +size: 636733 +timestamp: 1769712412683 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pthread-stubs-0.4-h86ecc28_1002.conda +sha256: 977dfb0cb3935d748521dd80262fe7169ab82920afd38ed14b7fee2ea5ec01ba +md5: bb5a90c93e3bac3d5690acf76b4a6386 +depends: +- libgcc >=13 +license: MIT +license_family: MIT +size: 8342 +timestamp: 1726803319942 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +sha256: 58994e0d2ea8584cb399546e6f6896d771995e6121d1a7b6a2c9948388358932 +md5: e17be1016bcc3516827b836cd3e4d9dc +depends: +- python >=3.9 +- pyyaml >=5.0,<=7.0 +license: MIT +license_family: MIT +size: 14645 +timestamp: 1736766960536 +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +sha256: 868569d9505b7fe246c880c11e2c44924d7613a8cdcc1f6ef85d5375e892f13d +md5: c3946ed24acdb28db1b5d63321dbca7d +depends: +- typing-inspection >=0.4.2 +- typing_extensions >=4.14.1 +- python >=3.10 +- typing-extensions >=4.6.1 +- annotated-types >=0.6.0 +- pydantic-core ==2.41.5 +- python +license: MIT +license_family: MIT +size: 340482 +timestamp: 1764434463101 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pydantic-core-2.41.5-py314h451b6cc_1.conda +sha256: f8acb2d03ebe80fed0032b9a989fc9acfb6735e3cd3f8c704b72728cb31868f6 +md5: 28f5027a1e04d67aa13fac1c5ba79693 +depends: +- python +- typing-extensions >=4.6.0,!=4.7.0 +- libgcc >=14 +- python 3.14.* *_cp314 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 1828339 +timestamp: 1762989038561 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda +sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a +md5: 6b6ece66ebcae2d5f326c77ef2c5a066 +depends: +- python >=3.9 +license: BSD-2-Clause +license_family: BSD +size: 889287 +timestamp: 1750615908735 +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8 +md5: 461219d1a5bd61342293efa2c0c90eac +depends: +- __unix +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 21085 +timestamp: 1733217331982 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.14.3-hb06a95a_101_cp314.conda +build_number: 101 +sha256: 
87e9dff5646aba87cecfbc08789634c855871a7325169299d749040b0923a356 +md5: 205011b36899ff0edf41b3db0eda5a44 +depends: +- bzip2 >=1.0.8,<2.0a0 +- ld_impl_linux-aarch64 >=2.36.1 +- libexpat >=2.7.3,<3.0a0 +- libffi >=3.5.2,<3.6.0a0 +- libgcc >=14 +- liblzma >=5.8.2,<6.0a0 +- libmpdec >=4.0.0,<5.0a0 +- libsqlite >=3.51.2,<4.0a0 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- openssl >=3.5.5,<4.0a0 +- python_abi 3.14.* *_cp314 +- readline >=8.3,<9.0a0 +- tk >=8.6.13,<8.7.0a0 +- tzdata +- zstd >=1.5.7,<1.6.0a0 +license: Python-2.0 +size: 37305578 +timestamp: 1770674395875 +python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +sha256: 74e417a768f59f02a242c25e7db0aa796627b5bc8c818863b57786072aeb85e5 +md5: 130584ad9f3a513cdd71b1fdc1244e9c +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 27848 +timestamp: 1772388605021 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +sha256: 233aebd94c704ac112afefbb29cf4170b7bc606e22958906f2672081bc50638a +md5: 235765e4ea0d0301c75965985163b5a1 +depends: +- cpython 3.14.3.* +- python_abi * *_cp314 +license: Python-2.0 +size: 50062 +timestamp: 1770674497152 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +sha256: e17bf63a30aec33432f1ead86e15e9febde9fc40a7f869c0e766be8d2db44170 +md5: 310259a5b03ff02289d7705f39e2b1d2 +depends: +- kaleido-core 0.2.1.* +- python >=3.5 +license: MIT +license_family: MIT +size: 18320 +timestamp: 1615204747600 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +build_number: 8 +sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 +md5: 0539938c55b6b1a59b560e843ad864a4 +constrains: +- python 3.14.* *_cp314 +license: BSD-3-Clause +license_family: BSD +size: 6989 +timestamp: 1752805904792 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/pyyaml-6.0.3-py314h807365f_1.conda +sha256: 496b5e65dfdd0aaaaa5de0dcaaf3bceea00fcb4398acf152f89e567c82ec1046 +md5: 9ae2c92975118058bd720e9ba2bb7c58 +depends: +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python >=3.14,<3.15.0a0 *_cp314 +- python_abi 3.14.* *_cp314 +- yaml >=0.2.5,<0.3.0a0 +license: MIT +license_family: MIT +size: 195678 +timestamp: 1770223441816 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda +sha256: fe695f9d215e9a2e3dd0ca7f56435ab4df24f5504b83865e3d295df36e88d216 +md5: 3d49cad61f829f4f0e0611547a9cda12 +depends: +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-3.0-only +license_family: GPL +size: 357597 +timestamp: 1765815673644 +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +sha256: 0577eedfb347ff94d0f2fa6c052c502989b028216996b45c7f21236f25864414 +md5: 870293df500ca7e18bedefa5838a22ab +depends: +- attrs >=22.2.0 +- python >=3.10 +- rpds-py >=0.7.0 +- typing_extensions >=4.4.0 +- python +license: MIT +license_family: MIT +size: 51788 +timestamp: 1760379115194 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/regex-2026.2.28-py314h51f160d_0.conda +sha256: 2080ecea825e1ef91a2422cc0bc63e85db9e38908ed17657fb8f41de7a6eee71 +md5: 818aa2c9f6b3c808da5e7be22a9a424c +depends: +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python >=3.14,<3.15.0a0 *_cp314 +- python_abi 3.14.* *_cp314 +license: Apache-2.0 AND CNRI-Python +license_family: PSF +size: 408097 +timestamp: 1772255205521 +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +sha256: 7813c38b79ae549504b2c57b3f33394cea4f2ad083f0994d2045c2e24cb538c5 +md5: c65df89a0b2e321045a9e01d1337b182 +depends: +- python >=3.10 +- certifi >=2017.4.17 +- charset-normalizer >=2,<4 +- idna >=2.5,<4 +- urllib3 >=1.21.1,<3 +- python +constrains: +- chardet >=3.0.2,<6 +license: Apache-2.0 +license_family: APACHE +size: 63602 
+timestamp: 1766926974520 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +sha256: b06ce84d6a10c266811a7d3adbfa1c11f13393b91cc6f8a5b468277d90be9590 +md5: 7a6289c50631d620652f5045a63eb573 +depends: +- markdown-it-py >=2.2.0 +- pygments >=2.13.0,<3.0.0 +- python >=3.10 +- typing_extensions >=4.0.0,<5.0.0 +- python +license: MIT +license_family: MIT +size: 208472 +timestamp: 1771572730357 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +sha256: aa3fcb167321bae51998de2e94d199109c9024f25a5a063cb1c28d8f1af33436 +md5: 0c20a8ebcddb24a45da89d5e917e6cb9 +depends: +- python >=3.10 +- rich >=12 +- click >=8 +- typing-extensions >=4 +- __unix +- python +license: MIT +license_family: MIT +size: 64356 +timestamp: 1769850479089 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rpds-py-0.30.0-py314h02b7a91_0.conda +sha256: a587240f16eac7c6a80f9585cef679cd1cb9a287b8dfcdd36dcef1f7e7db15dc +md5: e7f6ed9e60043bb5cbcc527764897f0d +depends: +- python +- libgcc >=14 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 376332 +timestamp: 1764543345455 +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +sha256: 7c65782d2511738e62c70462e89d65da4fa54d5a7e47c46667bcd27a59f81876 +md5: 472239e4eb7b5a84bb96b3ed7e3a596a +depends: +- colormath >=3.0.0 +- python >=3.9 +license: MIT +license_family: MIT +size: 22284 +timestamp: 1735770589188 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sqlite-3.52.0-hf1c7be2_0.conda +sha256: 4f8523f5341f0d9e1547085206c6c1f71f9fc7c277443ca363a8cf98add8fc01 +md5: d9634079df93a65ee045b3c75f35cae1 +depends: +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libsqlite 3.52.0 h10b116e_0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- readline >=8.3,<9.0a0 +license: blessing +size: 209416 +timestamp: 1772818891689 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/tiktoken-0.12.0-py314h6a36e60_3.conda +sha256: c1da41c79262b27efa168407cfecc47b20270e5fc071a8307f95a2c85fb94170 +md5: 55bf7b559202236157b14323b40f19e6 +depends: +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +- regex >=2022.1.18 +- requests >=2.26.0 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 914402 +timestamp: 1764030357702 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda +sha256: e25c314b52764219f842b41aea2c98a059f06437392268f09b03561e4f6e5309 +md5: 7fc6affb9b01e567d2ef1d05b84aa6ed +depends: +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +constrains: +- xorg-libx11 >=1.8.12,<2.0a0 +license: TCL +license_family: BSD +size: 3368666 +timestamp: 1769464148928 +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +sha256: 9ef8e47cf00e4d6dcc114eb32a1504cc18206300572ef14d76634ba29dfe1eb6 +md5: e5ce43272193b38c2e9037446c1d9206 +depends: +- python >=3.10 +- __unix +- python +license: MPL-2.0 and MIT +size: 94132 +timestamp: 1770153424136 +- conda: https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +sha256: 39d8ae33c43cdb8f771373e149b0b4fae5a08960ac58dcca95b2f1642bb17448 +md5: 260af1b0a94f719de76b4e14094e9a3b +depends: +- importlib-metadata >=3.6 +- python >=3.10 +- typing-extensions >=4.10.0 +- typing_extensions >=4.14.0 +constrains: +- pytest >=7 +license: MIT +license_family: MIT +size: 36838 +timestamp: 1771532971545 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +sha256: 7c2df5721c742c2a47b2c8f960e718c930031663ac1174da67c1ed5999f7938c +md5: edd329d7d3a4ab45dcf905899a7a6115 +depends: +- typing_extensions ==4.15.0 pyhcf101f3_0 +license: PSF-2.0 +license_family: PSF +size: 91383 +timestamp: 1756220668932 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +sha256: 70db27de58a97aeb7ba7448366c9853f91b21137492e0b4430251a1870aa8ff4 +md5: a0a4a3035667fc34f29bfbd5c190baa6 +depends: +- python >=3.10 +- typing_extensions >=4.12.0 +license: MIT +license_family: MIT +size: 18923 +timestamp: 1764158430324 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 +md5: 0caa1af407ecff61170c9437a808404d +depends: +- python >=3.10 +- python +license: PSF-2.0 +license_family: PSF +size: 51692 +timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c +md5: ad659d0a2b3e47e38d829aa8cad2d610 +license: LicenseRef-Public-Domain +size: 119135 +timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +sha256: af641ca7ab0c64525a96fd9ad3081b0f5bcf5d1cbb091afb3f6ed5a9eee6111a +md5: 9272daa869e03efe68833e3dc7a02130 +depends: +- backports.zstd >=1.0.0 +- brotli-python >=1.2.0 +- h2 >=4,<5 +- pysocks >=1.5.6,<2.0,!=1.5.7 +- python >=3.10 +license: MIT +license_family: MIT +size: 103172 +timestamp: 1767817860341 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxau-1.0.12-he30d5cf_1.conda +sha256: e9f6e931feeb2f40e1fdbafe41d3b665f1ab6cb39c5880a1fcf9f79a3f3c84a5 +md5: 1c246e1105000c3660558459e2fd6d43 +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 16317 +timestamp: 1762977521691 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdmcp-1.1.5-he30d5cf_1.conda +sha256: 128d72f36bcc8d2b4cdbec07507542e437c7d67f677b7d77b71ed9eeac7d6df1 +md5: bff06dcde4a707339d66d45d96ceb2e2 +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 21039 +timestamp: 1762979038025 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/yaml-0.2.5-h80f16a2_3.conda +sha256: 66265e943f32ce02396ad214e27cb35f5b0490b3bd4f064446390f9d67fa5d88 +md5: 032d8030e4a24fe1f72c74423a46fb88 +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 88088 +timestamp: 1753484092643 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +sha256: b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae +md5: 30cd29cb87d819caead4d55184c1d115 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 24194 +timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-ng-2.3.3-ha7cb516_1.conda +sha256: 638a3a41a4fbfed52d3c60c8ef5a3693b3f12a5b1a3f58fa29f5698d0a0702e2 +md5: f731af71c723065d91b4c01bb822641b +depends: +- libgcc >=14 +- libstdcxx >=14 +license: Zlib +license_family: Other +size: 121046 +timestamp: 1770167944449 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda +sha256: 569990cf12e46f9df540275146da567d9c618c1e9c7a0bc9d9cfefadaed20b75 +md5: c3655f82dcea2aa179b291e7099c1fcc +depends: +- libzlib >=1.3.1,<2.0a0 +license: BSD-3-Clause +license_family: BSD +size: 614429 +timestamp: 1764777145593 diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 00000000..009874d4 --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::multiqc=1.33 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf new file mode 100644 index 00000000..5376aea1 --- /dev/null +++ b/modules/nf-core/multiqc/main.nf @@ -0,0 +1,50 @@ +process MULTIQC { + tag "${meta.id}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container 
"${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/34/34e733a9ae16a27e80fe00f863ea1479c96416017f24a907996126283e7ecd4d/data' + : 'community.wave.seqera.io/library/multiqc:1.33--ee7739d47738383b'}" + + input: + tuple val(meta), path(multiqc_files, stageAs: "?/*"), path(multiqc_config, stageAs: "?/*"), path(multiqc_logo), path(replace_names), path(sample_names) + + output: + tuple val(meta), path("*.html"), emit: report + tuple val(meta), path("*_data"), emit: data + tuple val(meta), path("*_plots"), emit: plots, optional: true + // MultiQC should not push its versions to the `versions` topic. Its input depends on the versions topic to be resolved thus outputting to the topic will let the pipeline hang forever + tuple val("${task.process}"), val('multiqc'), eval('multiqc --version | sed "s/.* //g"'), emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ? "--filename ${task.ext.prefix}.html" : '' + def config = multiqc_config ? multiqc_config instanceof List ? "--config ${multiqc_config.join(' --config ')}" : "--config ${multiqc_config}" : "" + def logo = multiqc_logo ? "--cl-config 'custom_logo: \"${multiqc_logo}\"'" : '' + def replace = replace_names ? "--replace-names ${replace_names}" : '' + def samples = sample_names ? "--sample-names ${sample_names}" : '' + """ + multiqc \\ + --force \\ + ${args} \\ + ${config} \\ + ${prefix} \\ + ${logo} \\ + ${replace} \\ + ${samples} \\ + . 
+ """ + + stub: + """ + mkdir multiqc_data + touch multiqc_data/.stub + mkdir multiqc_plots + touch multiqc_plots/.stub + touch multiqc_report.html + """ +} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml new file mode 100644 index 00000000..57cf43ca --- /dev/null +++ b/modules/nf-core/multiqc/meta.yml @@ -0,0 +1,133 @@ +name: multiqc +description: Aggregate results from bioinformatics analyses across many samples + into a single report +keywords: + - QC + - bioinformatics tools + - Beautiful stand-alone HTML report +tools: + - multiqc: + description: | + MultiQC searches a given directory for analysis logs and compiles a HTML report. + It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. + homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ + licence: + - "GPL-3.0-or-later" + identifier: biotools:multiqc +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + ontologies: [] + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + ontologies: [] + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 + - sample_names: + type: file + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. 
+ pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 +output: + report: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - "*.html": + type: file + description: MultiQC report file + pattern: ".html" + ontologies: [] + data: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - "*_data": + type: directory + description: MultiQC data dir + pattern: "multiqc_data" + plots: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - "*_plots": + type: file + description: Plots created by MultiQC + pattern: "*_plots" + ontologies: [] + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - multiqc: + type: string + description: The tool name + - multiqc --version | sed "s/.* //g": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" +containers: + conda: + linux/amd64: + lock_file: modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt + linux/arm64: + lock_file: modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt + docker: + linux/amd64: + name: community.wave.seqera.io/library/multiqc:1.33--c1f4a7982b743963 + build_id: bd-c1f4a7982b743963_1 + scan_id: sc-b7b7f470b2a16699_1 + linux/arm64: + name: community.wave.seqera.io/library/multiqc:1.33--40bf3b435e89dc22 + build_id: bd-40bf3b435e89dc22_1 + scan_id: sc-0e2108a0e7368d2f_1 + singularity: + linux/amd64: + name: oras://community.wave.seqera.io/library/multiqc:1.33--9b3473b1c4bb0493 + build_id: bd-9b3473b1c4bb0493_1 + https: 
https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c4/c4e6d9f669e1a99b53c7dc5cdd6b8e7fd6654032c755bb783cc9849e8203f4d1/data + linux/arm64: + name: oras://community.wave.seqera.io/library/multiqc:1.33--e1ef2065eb21b530 + build_id: bd-e1ef2065eb21b530_1 + https: https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/2a/2acce766e3efb280fa43acdbe85305ea6496ddadbcaa2d806ac4985dfe4686ce/data diff --git a/modules/nf-core/multiqc/tests/custom_prefix.config b/modules/nf-core/multiqc/tests/custom_prefix.config new file mode 100644 index 00000000..b30b1358 --- /dev/null +++ b/modules/nf-core/multiqc/tests/custom_prefix.config @@ -0,0 +1,5 @@ +process { + withName: 'MULTIQC' { + ext.prefix = "custom_prefix" + } +} diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test new file mode 100644 index 00000000..4cbdb95d --- /dev/null +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -0,0 +1,211 @@ +nextflow_process { + + name "Test Process MULTIQC" + script "../main.nf" + process "MULTIQC" + + tag "modules" + tag "modules_nfcore" + tag "multiqc" + + config "./nextflow.config" + + test("sarscov2 single-end [fastqc]") { + + when { + process { + """ + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [], + [], + [], + [] + ]) + """ + } + } + + then { + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } + else if (key == "report") { + return 
[key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() + } + } + + test("sarscov2 single-end [fastqc] - custom prefix") { + config "./custom_prefix.config" + + when { + process { + """ + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [], + [], + [], + [] + ]) + """ + } + } + + then { + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } + else if (key == "report") { + return [key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() + } + } + + test("sarscov2 single-end [fastqc] [config]") { + + when { + process { + """ + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + file("https://raw.githubusercontent.com/nf-core/seqinspector/1.0.0/assets/multiqc_config.yml", checkIfExists: true), + [], + [], + [] + ]) + """ + } + } + + then { + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } 
+ else if (key == "report") { + return [key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() + } + } + + test("sarscov2 single-end [fastqc] [multiple configs]") { + + when { + process { + """ + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [ + file("https://raw.githubusercontent.com/nf-core/seqinspector/1.0.0/assets/multiqc_config.yml", checkIfExists: true), + file("https://raw.githubusercontent.com/nf-core/seqinspector/1.0.0/assets/multiqc_config.yml", checkIfExists: true) + ], + [], + [], + [] + ]) + """ + } + } + + then { + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } + else if (key == "report") { + return [key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() + } + } + + test("sarscov2 single-end [fastqc] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [], + [], + [], + [] + ]) + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot(sanitizeOutput(process.out)).match() } + ) + } + } +} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap new file mode 100644 index 00000000..3bfc524f --- /dev/null +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -0,0 +1,422 @@ +{ + 
"sarscov2 single-end [fastqc] [multiple configs]": { + "content": [ + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + "fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", + "fastqc_per_base_n_content_plot.pdf", + "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + "fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + "fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + "fastqc_sequence_counts_plot-cnt.png", + "fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + 
"fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + "fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "multiqc_report.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } + ], + "timestamp": "2026-03-17T16:15:42.577775492", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 single-end [fastqc]": { + "content": [ + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + "fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_software_versions.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", + "fastqc_per_base_n_content_plot.pdf", + "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + 
"fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + "fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + "fastqc_sequence_counts_plot-cnt.png", + "fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + "fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + "fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "multiqc_report.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } + ], + "timestamp": "2026-03-17T16:21:17.072841555", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 single-end [fastqc] - stub": { + "content": [ + { + "data": [ + [ + { + "id": "FASTQC" + }, + [ + ".stub:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "plots": [ + [ + { + "id": "FASTQC" + }, + [ + ".stub:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "report": [ + [ + { + "id": "FASTQC" + }, + 
"multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } + ], + "timestamp": "2026-02-26T15:14:39.789193051", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 single-end [fastqc] [config]": { + "content": [ + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + "fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", + "fastqc_per_base_n_content_plot.pdf", + "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + "fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + "fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + 
"fastqc_sequence_counts_plot-cnt.png", + "fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + "fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + "fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "multiqc_report.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } + ], + "timestamp": "2026-03-17T16:15:30.372239611", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 single-end [fastqc] - custom prefix": { + "content": [ + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + "fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_software_versions.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", 
+ "fastqc_per_base_n_content_plot.pdf", + "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + "fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + "fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + "fastqc_sequence_counts_plot-cnt.png", + "fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + "fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + "fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "custom_prefix.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } + ], + "timestamp": "2026-03-17T16:15:18.189023981", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/nextflow.config 
b/modules/nf-core/multiqc/tests/nextflow.config new file mode 100644 index 00000000..374dfef2 --- /dev/null +++ b/modules/nf-core/multiqc/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: 'MULTIQC' { + ext.prefix = null + ext.args = '-p' + } +} diff --git a/nextflow.config b/nextflow.config index 3cff8ca6..910a3ec7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -6,78 +6,100 @@ ---------------------------------------------------------------------------------------- */ -trace { - enabled = true - file = "trace.txt" -} - -timeline { - enabled = true - file = "timeline.html" -} - - // Global default params, used in configs params { // Mandatory inputs - outdir = null species = null - // Local datasets - input = null + // general options + keywords = "" + target_genes = "" + target_gene_file = null + platform = null + accessions_only = false + download_only = false - // Normalization - normalization_method = 'deseq2' + // Local datasets + datasets = null // Expression atlas - fetch_eatlas_accessions = false - eatlas_keywords = "" - eatlas_accessions = "" + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + accessions = "" + excluded_accessions = "" + accessions_file = null + excluded_accessions_file = null - // Boilerplate options + // ID mapping + gprofiler_target_db = "ENSG" + gene_metadata = null + gene_id_mapping = null + skip_id_mapping = false + skip_cleaning_gene_ids = false + min_occurrence_freq = 0.1 + min_occurrence_quantile = 0.2 + + // sample filtering + max_zero_ratio = 0.9 + max_null_ratio = 0.9 + max_null_ratio_valid_sample = 0.75 + + // statistics + normalisation_method = 'tpm' + gene_length = null + gff = null + quantile_norm_target_distrib = 'uniform' + nb_sections = 20 + nb_candidates_per_section = 250 + missing_value_imputer = 'iterative' + // stability scoring + skip_genorm = false + stability_score_weights = "0.5,0.5,0,0" + + // random sampling + random_sampling_seed = 42 + random_sampling_size = 5000 + + // 
MultiQC options + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null + + // Boilerplate options + outdir = null publish_dir_mode = 'copy' email = null email_on_fail = null plaintext_email = false monochrome_logs = false - hook_url = null + hook_url = System.getenv('HOOK_URL') help = false + help_full = false + show_hidden = false version = false pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' + trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') // Config options config_profile_name = null config_profile_description = null + custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null -} -validation { - // logs - monochromeLogs = false - help.enabled = true + // Schema validation default options + validate_params = true } // Load base.config by default for all pipelines includeConfig 'conf/base.config' -// Load nf-core custom profiles from different Institutions -try { - includeConfig "${params.custom_config_base}/nfcore_custom.config" -} catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") -} - -// Load nf-core/stableexpression custom profiles from different institutions. 
-try { - includeConfig "${params.custom_config_base}/pipeline/stableexpression.config" -} catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config/stableexpression profiles: ${params.custom_config_base}/pipeline/stableexpression.config") -} profiles { debug { dumpHashes = true @@ -92,7 +114,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false - conda.channels = ['conda-forge', 'bioconda', 'defaults'] + conda.channels = ['conda-forge', 'bioconda'] apptainer.enabled = false } mamba { @@ -105,6 +127,17 @@ profiles { charliecloud.enabled = false apptainer.enabled = false } + micromamba { + conda.enabled = true + conda.useMicromamba = true + conda.channels = ['conda-forge', 'bioconda'] + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } docker { docker.enabled = true conda.enabled = false @@ -115,7 +148,18 @@ profiles { apptainer.enabled = false docker.runOptions = '-u $(id -u):$(id -g)' } - arm { + arm64 { + process.arch = 'arm64' + // TODO https://github.com/nf-core/modules/issues/6694 + // For now if you're using arm64 you have to use wave for the sake of the maintainers + // wave profile + apptainer.ociAutoPull = true + singularity.ociAutoPull = true + wave.enabled = true + wave.freeze = true + wave.strategy = 'conda,container' + } + emulate_amd64 { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { @@ -172,30 +216,37 @@ profiles { wave.freeze = true wave.strategy = 'conda,container' } - gitpod { - executor.name = 'local' - executor.cpus = 4 - executor.memory = 8.GB + gpu { + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--nv' + singularity.runOptions = '--nv' } - test { includeConfig 'conf/test.config' } - test_input { includeConfig 'conf/test_input.config' } - test_eatlas { includeConfig 'conf/test_eatlas.config' } - 
test_accessions { includeConfig 'conf/test_accessions.config' } - test_full { includeConfig 'conf/test_full.config' } + + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_dataset_eatlas { includeConfig 'conf/test_dataset_eatlas.config' } } +// Load nf-core custom profiles from different institutions + +// If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. +// Load nf-core/stableexpression custom profiles from different institutions. +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" + +// Load nf-core/stableexpression custom profiles from different institutions. +// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/pipeline/stableexpression.config" : "/dev/null" + // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Singularity are enabled // Set to your registry if you have a mirror of containers -apptainer.registry = 'quay.io' -docker.registry = 'quay.io' -podman.registry = 'quay.io' -singularity.registry = 'quay.io' +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' +charliecloud.registry = 'quay.io' + -// Nextflow plugins -plugins { - id 'nf-schema@2.0.0' -} // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. @@ -208,40 +259,67 @@ env { JULIA_DEPOT_PATH = "/usr/local/share/julia" } -// Capture exit codes from upstream processes when piping -process.shell = ['/bin/bash', '-euo', 'pipefail'] +// Set bash options +process.shell = [ + "bash", + "-C", // No clobber - prevent output redirection from overwriting files. + "-e", // Exit if a tool returns a non-zero status/exit code + "-u", // Treat unset variables and parameters as an error + "-o", // Returns the status of the last command to exit.. + "pipefail" // ..with a non-zero status or zero if all successfully execute +] // Disable process selector warnings by default. Use debug profile to enable warnings. 
nextflow.enable.configProcessNamesValidation = false -def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${params.trace_report_suffix}.html" } report { enabled = true - file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${params.trace_report_suffix}.html" } trace { enabled = true - file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${params.trace_report_suffix}.txt" } dag { enabled = true - file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${params.trace_report_suffix}.html" } manifest { name = 'nf-core/stableexpression' - author = """Olivier Coen""" + contributors = [ + // TODO nf-core: Update the field with the details of the contributors to your pipeline. 
New with Nextflow version 24.10.0 + [ + name: 'Olivier Coen', + affiliation: 'CNRS / Université Paris-Saclay', + email: 'olivier.coen@universite-paris-saclay.fr', + github: 'OlivierCoen', + contribution: ['author', 'maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0003-3387-1040' + ], + ] homePage = 'https://github.com/nf-core/stableexpression' description = """This pipeline is dedicated to finding the most stable genes across count datasets""" mainScript = 'main.nf' - nextflowVersion = '!>=23.04.0' - version = '1.0dev' + defaultBranch = 'main' + nextflowVersion = '!>=25.04.0' + version = '1.0.0' doi = '' } +// Nextflow plugins +plugins { + id 'nf-schema@2.5.1' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} + +validation { + defaultIgnoreParams = ["genomes"] + monochromeLogs = params.monochrome_logs +} + // Load modules.config for DSL2 module specific options includeConfig 'conf/modules.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 699eb0a4..8109b426 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/nextflow_schema.json", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/main/nextflow_schema.json", "title": "nf-core/stableexpression pipeline parameters", - "description": "This pipeline is dedicated to finding the most stable genes across count datasets", + "description": "This pipeline is dedicated to identifying the most stable genes within a single or multiple expression dataset(s).
This is particularly useful for identifying the most suitable RT-qPCR reference genes for a specific species.", "type": "object", "$defs": { "input_output_options": { @@ -14,84 +14,342 @@ "properties": { "species": { "type": "string", - "description": "Species name.", + "description": "Scientific species name (genus and species)", "fa_icon": "fas fa-hippo", - "pattern": "([a-zA-Z]+)[_ ]([a-zA-Z]+)", - "help_text": "e.g. `--species 'Arabidopsis thaliana'` or `--species 'homo_sapiens'`" + "pattern": "^([a-zA-Z]+)[_ ]([a-zA-Z]+)[_ a-zA-Z]*$", + "help_text": "At least genus and species name should be supplied. Words should be separated by ` ` or `_`. Note that character case is ignored. Examples: `--species 'Arabidopsis thaliana'`, `--species 'homo_sapiens'` or `--species MARMOTA_MARMOTA_MARMOTA`." }, "outdir": { "type": "string", "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "description": "Output directory", + "help_text": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, + "datasets": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_datasets.json", + "pattern": "^\\S+\\.(csv|yaml|yml|dat)$", + "description": "Custom datasets (counts + designs)", + "help_text": "Path to CSV / YAML file listing your own count datasets and their related experimental design. This file should be a comma-separated file with 4 columns (`counts`, `design`, `platform` and `normalised`). It must have a header row. Before running the pipeline, and for each count dataset provided by you, a design file with information about the samples in your experiment is required. Combine with --skip_fetch_eatlas_accessions if you only want to analyse your own count datasets.
Otherwise, accessions from Expression Atlas and GEO will be fetched automatically. See [usage docs](https://nf-co.re/stableexpression/usage#samplesheet-input) for more information.", + "fa_icon": "fas fa-file-csv" + }, + "keywords": { + "type": "string", + "description": "Keywords used for selecting specific Expression Atlas / GEO accessions", + "fa_icon": "fas fa-font", + "pattern": "(([a-zA-Z,]+))?", + "help_text": "Keywords (separated by commas) to use when retrieving specific experiments from Expression Atlas and / or GEO datasets. The pipeline will select all Expression Atlas experiments / GEO datasets that contain the provided keywords in their description or in one of the condition names. Example: `--keywords 'stress,flowering'`. This parameter is unused if --skip_fetch_eatlas_accessions is set and --fetch_geo_accessions is not set." + }, + "target_genes": { + "type": "string", + "description": "Target genes", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "One or multiple target genes (separated by commas). These can be gene IDs (as provided in your input datasets), Ensembl gene IDs, or gene symbols." + }, + "target_gene_file": { + "type": "string", + "description": "File containing target genes", + "format": "file-path", + "exists": true, + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "File containing one or multiple target genes (one ID per line). These can be gene IDs (as provided in your input datasets), Ensembl gene IDs, or gene symbols." + }, + "platform": { + "type": "string", + "enum": ["rnaseq", "microarray"], + "description": "Only download from this platform", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "By default, data from both RNA-seq and Microarray platforms are downloaded. Setting this parameter applies a filter to get data from only one of the two platforms. This filter is only used while fetching appropriate Expression Atlas / GEO accessions. It will not filter accessions provided directly by the user."
+ }, + "accessions_only": { + "type": "boolean", + "description": "Only get accessions from Expression Atlas / GEO and exit.", + "fa_icon": "far fa-stop-circle", + "help_text": "Use this option if you only want to get Expression Atlas accessions and skip the rest of the pipeline." + }, + "download_only": { + "type": "boolean", + "description": "Only get accessions from Expression Atlas / GEO and download the selected datasets.", + "fa_icon": "far fa-stop-circle", + "help_text": "Use this option if you only want to get Expression Atlas / GEO accessions, download the selected data, and skip the rest of the pipeline." + }, "email": { "type": "string", "description": "Email address for completion summary.", "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" } } }, - "local_datasets_options": { - "title": "Local datasets options", + "public_data_options": { + "title": "Public data options", "type": "object", - "fa_icon": "fas fa-dna", - "description": "Options for local count datasets to analyze.", + "fa_icon": "fas fa-book-atlas", + "description": "Options for fetching experiment data from Expression Atlas / GEO.", "properties": { - "datasets": { + "skip_fetch_eatlas_accessions": { + "type": "boolean", + "fa_icon": "fas fa-cloud-arrow-down", + "description": "Skip fetching Expression Atlas accessions", + "help_text": "Expression Atlas accessions are automatically fetched by default. Set this parameter to skip this step." 
+ }, + "fetch_geo_accessions": { + "type": "boolean", + "fa_icon": "fas fa-cloud-arrow-down", + "description": "Fetch GEO accessions from NCBI [Experimental]", + "help_text": "Set this parameter to fetch GEO accessions from NCBI. **This feature is experimental and may not work as expected**. Please report any issues to https://github.com/nf-core/stableexpression/issues." + }, + "accessions": { + "type": "string", + "pattern": "([A-Z0-9-]+,?)+", + "description": "Expression Atlas / GEO accession(s) to include", + "fa_icon": "fas fa-address-card", + "help_text": "Provide Expression Atlas / GEO accession(s) that you want to download. The accessions should be comma-separated. Example: `--accessions E-MTAB-552,E-GEOD-61690,GSE8165,GSE8161`. Combine with --skip_fetch_eatlas_accessions if you want only these accessions to be used. User provided accessions are prioritised over excluded accessions." + }, + "accessions_file": { "type": "string", "format": "file-path", "exists": true, - "schema": "assets/schema_input.json", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the input count datasets and their related experimental design.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/stableexpression/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "description": "File containing Expression Atlas / GEO accession(s) to download", + "fa_icon": "fas fa-file", + "help_text": "File containing Expression Atlas / GEO accession(s) that you want to download. One accession per line. Example: `--accessions_file included_accessions.txt`. Combine with --skip_fetch_eatlas_accessions if you want only these accessions to be used. 
User provided accessions are prioritised over excluded accessions." + }, + "excluded_accessions": { + "type": "string", + "pattern": "([A-Z0-9-]+,?)+", + "description": "Expression Atlas accession(s) to exclude", + "fa_icon": "fas fa-id-card", + "help_text": "Provide Expression Atlas / GEO accession(s) that you want to exclude. The accessions should be comma-separated. Example: `--excluded_accessions E-MTAB-552,E-GEOD-61690`" + }, + "excluded_accessions_file": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "File containing Expression Atlas accession(s) to exclude", + "fa_icon": "fas fa-file", + "help_text": "File containing Expression Atlas / GEO accession(s) that you want to exclude. One accession per line. Example: `--excluded_accessions_file excluded_accessions.txt`." } } }, - "expression_atlas_options": { - "title": "Expression Atlas options", + "idmapping_options": { + "title": "ID mapping options", "type": "object", - "fa_icon": "fas fa-dna", - "description": "Options for fetching datasets from Expression Atlas.", + "fa_icon": "fas fa-map", + "description": "Options for mapping gene IDs.", "properties": { - "fetch_eatlas_accessions": { + "skip_id_mapping": { + "type": "boolean", + "description": "Skip g:Profiler ID mapping step", + "fa_icon": "fas fa-ban", + "help": "If you don't want to map gene IDs with g:Profiler, you can skip this step by providing `--skip_id_mapping`. It can be in particular useful if the g:Profiler is down and if you already have a custom mapping file." + }, + "skip_cleaning_gene_ids": { "type": "boolean", - "fa_icon": "fas fa-book-atlas", - "description": "Fetch count datasets and experimental designs from Expression Atlas for this species.", - "help_text": "If you want to fetch count data for this species from Expression Atlas, set this parameter to `true`. You do not need to set this parameter if you provide Expression Atlas keywords with `--eatlas_keywords`." 
+ "description": "Skip cleaning gene IDs step", + "fa_icon": "fas fa-ban", + "help": "If you don't want to clean gene IDs, you can skip this step by providing `--skip_cleaning_gene_ids`. Note that gene ID cleaning is automatically disabled with `--skip_id_mapping`." }, - "eatlas_keywords": { + "gprofiler_target_db": { "type": "string", - "description": "Keywords (separated by commas) to use when retrieving specific experiments from Expression Atlas.", - "fa_icon": "fas fa-book-atlas", - "pattern": "([a-zA-Z,]+)", - "help_text": "e.g. `--eatlas_keywords 'stress,flowering'`" + "description": "Experimental: target database for g:Profiler", + "fa_icon": "fas fa-divide", + "enum": ["ENSG", "ENTREZGENE", "UNIPROTSPTREMBL", "UNIPROTSWISSPROT"], + "default": "ENSG", + "help_text": "Experimental: target database for g:Profiler. You can see the full list of available target databases at https://biit.cs.ut.ee/gprofiler/convert." }, - "eatlas_accessions": { + "gene_id_mapping": { "type": "string", - "pattern": "([A-Z0-9-]+,?)+", - "description": "Expression Atlas accession(s), separated by commas.", - "fa_icon": "fas fa-book-atlas", - "help_text": "e.g. `--eatlas_accessions 'E-MTAB-552,E-GEOD-61690'`" + "format": "file-path", + "exists": true, + "schema": "assets/schema_gene_id_mapping.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|dat)$", + "description": "Custom gene id mapping file", + "help_text": "Path to comma-separated file containing custom gene id mappings. Each row represents a mapping from the original gene ID in your count datasets to a preferred gene ID. 
The mapping file should be a comma-separated file with 2 columns (original_gene_id and gene_id) and a header row.", + "fa_icon": "fas fa-file" + }, + "gene_metadata": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_gene_metadata.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|dat)$", + "description": "Custom gene metadata file", + "help_text": "Path to comma-separated file containing custom gene metadata information. Each row represents a gene and links its gene ID to its name and description. The metadata file should be a comma-separated file with 3 columns (gene_id, name and description) and a header row.", + "fa_icon": "fas fa-file" + }, + "min_occurrence_quantile": { + "type": "number", + "description": "Minimum quantile for the frequency of occurrence", + "fa_icon": "fas fa-battery-three-quarters", + "minimum": 0, + "maximum": 1, + "default": 0.2, + "help_text": "To avoid genes that are rarely observed, genes less represented than the specified quantile will be filtered out. For example, value of 0.2 means that the 20% less represented will be filtered out. This filter is applied before using the absolute filter `--min_occurrence_freq`." + }, + "min_occurrence_freq": { + "type": "number", + "description": "Minimum frequency of occurrence among all datasets", + "fa_icon": "fas fa-battery-three-quarters", + "minimum": 0, + "maximum": 1, + "default": 0.1, + "help_text": "To avoid genes that are rarely observed, genes showing a frequency of occurrence below this threshold will be filtered out." 
+ } + } + }, + "sample_filtering_options": { + "title": "Sample filtering options", + "type": "object", + "fa_icon": "fas fa-chart-line", + "description": "Options for filtering samples based on their expression levels.", + "properties": { + "max_zero_ratio": { + "type": "number", + "description": "Maximum ratio of zero counts to total counts", + "fa_icon": "fas fa-divide", + "minimum": 0, + "maximum": 1, + "default": 0.9, + "help_text": "A filter is set up to avoid samples that contain a high proportion of zero counts. All samples with a ratio of zero counts to total counts above this threshold will be filtered out." + }, + "max_null_ratio": { + "type": "number", + "description": "Maximum ratio of null values", + "fa_icon": "fas fa-divide", + "minimum": 0, + "maximum": 1, + "default": 0.9, + "help_text": "A filter is set up to avoid samples that contain a high proportion of null values. All samples with a ratio of null values to total values above this threshold will be filtered out." + }, + "max_null_ratio_valid_sample": { + "type": "number", + "description": "Maximum ratio of null values in a sample for it to be considered in the computation of the null value malus", + "fa_icon": "fas fa-divide", + "minimum": 0, + "maximum": 1, + "default": 0.75, + "help_text": "After filtering out samples with a very high proportion of null values (via `--max_null_ratio`), a second filter is set up to avoid samples that contain a substantial proportion of null values to be considered in the malus of null values comprised in the stability score." 
} } }, - "normalization_options": { - "title": "Normalization options", + "statistical_options": { + "title": "Statistics options", "type": "object", - "fa_icon": "fas fa-dna", - "description": "Options for normalizing datasets.", + "fa_icon": "fas fa-chart-line", + "description": "Statistical options for normalisation and calculation of gene expression variation.", "properties": { - "normalization_method": { + "normalisation_method": { + "type": "string", + "description": "Count normalisation method", + "fa_icon": "fas fa-divide", + "enum": ["tpm", "cpm"], + "default": "tpm", + "help_text": "Raw RNAseq data must be normalised before further processing. `tpm` offers a more accurate representation of gene expression levels as it is unbiased toward gene length. However, you can choose `cpm` if you do not have access to a genome annotation." + }, + "gff": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/tsv", + "pattern": "^\\S+\\.(gff|dat)$", + "description": "Genome annotation file (GFF format)", + "help_text": "Path to genome annotation file (GFF format). Cannot be compressed.", + "fa_icon": "fas fa-file" + }, + "gene_length": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_gene_length.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|tsv|dat)$", + "description": "Gene length file", + "help_text": "Path to comma-separated file containing gene lengths. Each row represents a gene and gives the length of its longest transcript. 
The file should be a comma-separated file with 2 columns (gene_id and length) and a header row.", + "fa_icon": "fas fa-file" + }, + "quantile_norm_target_distrib": { + "type": "string", + "description": "Target distribution for quantile normalisation", + "fa_icon": "fas fa-chart-bar", + "enum": ["uniform", "normal"], + "default": "uniform", + "help_text": "In order to compare counts between samples and different datasets, all normalised counts are quantile normalised and mapped to a specific distribution. The pipeline uses scikit-learn's quantile_transform function. You can select the target distribution to map counts to." + }, + "missing_value_imputer": { "type": "string", - "description": "Tool to use for normalization.", - "fa_icon": "fas fa-chart-simple", - "enum": ["deseq2", "edger"] + "description": "Type of imputation method to use for missing values", + "fa_icon": "fas fa-battery-three-quarters", + "enum": ["iterative", "knn", "gene_mean"], + "default": "iterative", + "help_text": "The pipeline provides three options for imputing missing values: iterative, k-nearest neighbors, and gene mean. Iterative imputation uses a bayesian iterative algorithm to fill in missing values. K-nearest neighbors imputation uses a k-nearest neighbors algorithm to fill in missing values. Gene mean imputation is a very basic method that replaces missing values with the mean expression level of the gene across all samples." + } + } + }, + "stability_scoring_options": { + "title": "Stability scoring options", + "type": "object", + "fa_icon": "fas fa-chart-line", + "description": "Options relative to assessment of stability for each gene.", + "properties": { + "nb_sections": { + "type": "integer", + "description": "Number of sections to divide genes into for stability scoring.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 1, + "help_text": "All genes are divided into sections based on their expression levels. Set this parameter to modify the number of sections." 
+ }, + "nb_candidates_per_section": { + "type": "integer", + "description": "Number of candidate genes to keep for stability scoring in each section", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 1, + "help_text": "Number of candidate genes to keep in each section for stability scoring. Within each section, the top candidates are selected based on the descriptor chosen with `--candidate_selection_descriptor`." + }, + "skip_genorm": { + "type": "boolean", + "description": "Skip Genorm", + "fa_icon": "fas fa-check", + "help": "Skip Genorm by setting this parameter to true. In this case, by default, only Normfinder will participate in the stability score." + }, + "stability_score_weights": { + "type": "string", + "description": "Weights for stability score calculation", + "fa_icon": "fas fa-balance-scale", + "help_text": "Weights for Normfinder / Genorm / Coefficient of Variation (CV) / Robust Coefficient of Variation on Median (RCVM) respectively. Must be a comma-separated string. Example: 0.5,0.5,0.0,0", + "pattern": "^\\d+(\\.\\d+)?,\\d+(\\.\\d+)?,\\d+(\\.\\d+)?,\\d+(\\.\\d+)?$" + } + } + }, + "scalability_options": { + "title": "Scalability options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Options to improve pipeline scalability and robustness", + "properties": { + "random_sampling_size": { + "type": "integer", + "description": "Number of public dataset samples to choose randomly before downloading.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 1, + "help_text": "When dealing with species for which there is a large number (eg. >10000) of samples considering all the downloaded datasets, users may encounter RAM issues (eg. errors with `137` exit codes). In such cases, it is recommended to sample a random subset of these datasets to reduce the computational load. A first subsampling is performed during the search for Expression Atlas accessions. 
In case there is still room for datasets and if the `--fetch_geo_accessions` flag was set, a second subsampling is performed during the search for NCBI GEO accessions." + }, + "random_sampling_seed": { + "type": "integer", + "description": "Seed for dataset random sampling.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 0, + "help_text": "Seed for dataset random sampling. This ensures reproducibility of the random sampling process. Changing the seed will result in a different random sample being selected." } } }, @@ -143,41 +401,6 @@ } } }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. 
Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" - } - } - }, "generic_options": { "title": "Generic options", "type": "object", @@ -185,12 +408,6 @@ "description": "Less common options for the pipeline, typically set in a config file.", "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, "version": { "type": "boolean", "description": "Display version and exit.", @@ -220,6 +437,14 @@ "fa_icon": "fas fa-remove-format", "hidden": true }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true + }, "monochrome_logs": { "type": "boolean", "description": "Do not use coloured log outputs.", @@ -233,12 +458,55 @@ "help_text": "Incoming hook URL for messaging service. 
Currently, MS Teams and Slack are supported.", "hidden": true }, + "multiqc_config": { + "type": "string", + "format": "file-path", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "multiqc_logo": { + "type": "string", + "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", + "hidden": true + }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, "pipelines_testdata_base_path": { "type": "string", "fa_icon": "far fa-check-circle", "description": "Base URL or local path to location of pipeline test dataset files", "default": "https://raw.githubusercontent.com/nf-core/test-datasets/", "hidden": true + }, + "trace_report_suffix": { + "type": "string", + "fa_icon": "far calendar", + "description": "Suffix to add to the trace report filename. Default is the date and time in the format yyyy-MM-dd_HH-mm-ss.", + "hidden": true + }, + "help": { + "type": ["boolean", "string"], + "description": "Display the help message." + }, + "help_full": { + "type": "boolean", + "description": "Display the full detailed help message." + }, + "show_hidden": { + "type": "boolean", + "description": "Display hidden parameters in the help message (only works when --help or --help_full are provided)." 
} } } @@ -248,19 +516,25 @@ "$ref": "#/$defs/input_output_options" }, { - "$ref": "#/$defs/local_datasets_options" + "$ref": "#/$defs/public_data_options" }, { - "$ref": "#/$defs/expression_atlas_options" + "$ref": "#/$defs/idmapping_options" }, { - "$ref": "#/$defs/normalization_options" + "$ref": "#/$defs/sample_filtering_options" }, { - "$ref": "#/$defs/institutional_config_options" + "$ref": "#/$defs/statistical_options" + }, + { + "$ref": "#/$defs/stability_scoring_options" + }, + { + "$ref": "#/$defs/scalability_options" }, { - "$ref": "#/$defs/max_job_request_options" + "$ref": "#/$defs/institutional_config_options" }, { "$ref": "#/$defs/generic_options" diff --git a/nf-test.config b/nf-test.config index 2fa82adf..a0a009fd 100644 --- a/nf-test.config +++ b/nf-test.config @@ -1,8 +1,31 @@ config { + // location for all nf-test tests + testsDir "." + // nf-test directory including temporary files for each test + workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" + + // location of an optional nextflow.config file specific for executing tests testsDir "tests" workDir ".nf-test" configFile "tests/nextflow.config" - profile "docker" + + // ignore tests coming from the nf-core/modules repo + ignore 'modules/nf-core/**/tests/*', 'subworkflows/nf-core/**/tests/*' + + // run all tests with defined profile(s) from the main nextflow.config + //profile "apptainer" + + // list of filenames or patterns that should trigger a full test run + triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' + + // load the necessary plugins + requires ( + "nf-test": "0.9.3" + ) + plugins { + load "nft-utils@0.0.3" + load "nft-csv@0.1.0" + } } diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..5fbcbac4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.ruff.lint] +# Avoid enforcing line-length violations (`E501`) +ignore = ["E501"] + +[tool.ruff.format] +# Use double quotes when formatting. 
+quote-style = "double" +indent-style = "space" + +[tool.basedpyright] +reportUnusedCallResult = "none" +reportUnknownMemberType = "none" +reportUnknownVariableType = "none" +reportUnknownParameterType = "none" +reportUnknownArgumentType = "none" +reportAny = "none" +reportImplicitRelativeImport = "none" diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json new file mode 100644 index 00000000..e99a20e2 --- /dev/null +++ b/ro-crate-metadata.json @@ -0,0 +1,346 @@ +{ + "@context": [ + "https://w3id.org/ro/crate/1.1/context", + { + "GithubService": "https://w3id.org/ro/terms/test#GithubService", + "JenkinsService": "https://w3id.org/ro/terms/test#JenkinsService", + "PlanemoEngine": "https://w3id.org/ro/terms/test#PlanemoEngine", + "TestDefinition": "https://w3id.org/ro/terms/test#TestDefinition", + "TestInstance": "https://w3id.org/ro/terms/test#TestInstance", + "TestService": "https://w3id.org/ro/terms/test#TestService", + "TestSuite": "https://w3id.org/ro/terms/test#TestSuite", + "TravisService": "https://w3id.org/ro/terms/test#TravisService", + "definition": "https://w3id.org/ro/terms/test#definition", + "engineVersion": "https://w3id.org/ro/terms/test#engineVersion", + "instance": "https://w3id.org/ro/terms/test#instance", + "resource": "https://w3id.org/ro/terms/test#resource", + "runsOn": "https://w3id.org/ro/terms/test#runsOn" + } + ], + "@graph": [ + { + "@id": "./", + "@type": "Dataset", + "creativeWorkStatus": "Stable", + "datePublished": "2026-03-14T09:55:43+00:00", + "description": "

\n \n \n \"nf-core/stableexpression\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/stableexpression)\n[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with apptainer](https://custom-icon-badges.demolab.com/badge/run%20with-apptainer-4545?logo=apptainer&color=teal&labelColor=000000)](https://apptainer.org/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera 
Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/stableexpression)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/stableexpression** is a bioinformatics pipeline aiming to aggregate multiple count datasets for a specific species and find the most stable genes. The datasets can be either downloaded from public databases (EBI, NCBI) or provided directly by the user. Both RNA-seq and Microarray count datasets can be utilised.\n\n

\n \n

\n\nIt takes as main inputs :\n\n- a species name (mandatory)\n- keywords for Expression Atlas / GEO search (optional)\n- a CSV input file listing your own raw / normalised count datasets (optional).\n\n**Use cases**:\n\n- **find the most suitable genes as RT-qPCR reference genes for a specific species (and optionally specific conditions)**\n- download all Expression Atlas and / or NCBI GEO datasets for a species (and optionally keywords)\n\n## Pipeline overview\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:\n\n#### 1. Get accessions from public databases\n\n- Get [Expression Atlas](https://www.ebi.ac.uk/gxa/home) dataset accessions corresponding to the provided species (and optionally keywords)\n This step is run by default but is optional. Set `--skip_fetch_eatlas_accessions` to skip it.\n- Get NBCI [GEO](https://www.ncbi.nlm.nih.gov/gds) **microarray** dataset accessions corresponding to the provided species (and optionally keywords)\n This is optional and **NOT** run by default. Set `--fetch_geo_accessions` to run it.\n\n#### 2. Download data (see [usage](./conf/usage.md#3-provide-your-own-accessions))\n\n- Download [Expression Atlas](https://www.ebi.ac.uk/gxa/home) data if any\n- Download NBCI [GEO](https://www.ncbi.nlm.nih.gov/gds) data if any\n\n> [!NOTE]\n> At this point, datasets downloaded from public databases are merged with datasets provided by the user using the `--datasets` parameter. See [usage](./conf/usage.md#4-use-your-own-expression-datasets) for more information about local datasets.\n\n#### 3. ID Mapping (see [usage](./conf/usage.md#5-custom-gene-id-mapping--metadata))\n\n- Gene IDs are cleaned\n- Map gene IDS to NCBI Entrez Gene IDS (or Ensembl IDs) for standardisation among datasets using [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) (run by default; optional)\n- Rare genes are filtered out\n\n#### 4. 
Sample filtering\n\nSamples that show too high ratios of zeros or missing values are removed from the analysis.\n\n#### 5. Normalisation of expression\n\n- Normalize RNAseq raw data using TPM (necessitates downloading the corresponding genome and computing transcript lengths) or CPM.\n- Perform quantile normalisation on each dataset separately using [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html)\n\n#### 6. Merge all data\n\nAll datasets are merged into one single dataframe.\n\n#### 7. Imputation of missing values\n\nMissing values are replaced by imputed values using a specific algorithm provided by [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html). The user can choose the method of imputation with the `--missing_value_imputer` parameter.\n\n#### 8. General statistics for each gene\n\nBase statistics are computed for each gene, platform-wide and for each platform (RNAseq and microarray).\n\n#### 9. Scoring\n\n- The whole list of genes is divided in multiple sections, based on their expression level.\n- Based on the coefficient of variation, a shortlist of candidates genes is extracted for each section.\n- Run optimised, scalable version of [Normfinder](https://www.moma.dk/software/normfinder)\n- Run optimised, scalable version of [Genorm](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2002-3-7-research0034) (run by default; optional)\n- Compute stability scores for each candidate gene\n\n#### 10. 
Reporting\n\n- Result aggregation\n- Make [`MultiQC`](http://multiqc.info/) report\n- Prepare [Dash Plotly](https://dash.plotly.com/) app for further investigation of gene / sample counts\n\n## Test pipeline\n\nYou can test the execution of the pipeline locally with:\n\n```bash\nnextflow run nf-core/stableexpression -profile test,\n```\n\n## Basic usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nTo search the most stable genes in a species considering all public datasets, simply run:\n\n```bash\nnextflow run nf-core/stableexpression \\\n -profile \\\n --species \\\n --outdir \\\n -resume\n```\n\n## More advanced usage\n\nFor more specific scenarios, like:\n\n- **fetching only specific conditions**\n- **using your own expression dataset(s)**\n\nplease refer to the [usage documentation](https://nf-co.re/stableexpression/usage).\n\n## Resource allocation\n\nFor setting pipeline CPU / memory usage, see [here](./docs/configuration.md).\n\n## Profiles\n\nSee [here](https://nf-co.re/stableexpression/usage#profiles) for more information about profiles.\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/stableexpression/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/stableexpression/output).\n\n## Support us\n\nIf you like nf-core/stableexpression, please make sure you give it a star on GitHub!\n\n[![stars - stableexpression](https://img.shields.io/github/stars/nf-core/stableexpression?style=social)](https://github.com/nf-core/stableexpression)\n\n## Credits\n\nnf-core/stableexpression was originally written 
by Olivier Coen.\n\nWe thank the following people for their assistance in the development of this pipeline:\n\n- R\u00e9my Costa\n- Shaheen Acheche\n- Janine Soares\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#stableexpression` channel](https://nfcore.slack.com/channels/stableexpression) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "hasPart": [ + { + "@id": "main.nf" + }, + { + "@id": "assets/" + }, + { + "@id": "bin/" + }, + { + "@id": "conf/" + }, + { + "@id": "docs/" + }, + { + "@id": "docs/images/" + }, + { + "@id": "modules/" + }, + { + "@id": "modules/local/" + }, + { + "@id": "modules/nf-core/" + }, + { + "@id": "workflows/" + }, + { + "@id": "subworkflows/" + }, + { + "@id": "nextflow.config" + }, + { + "@id": "README.md" + }, + { + "@id": "nextflow_schema.json" + }, + { + "@id": "CHANGELOG.md" + }, + { + "@id": "LICENSE" + }, + { + "@id": "CODE_OF_CONDUCT.md" + }, + { + "@id": "CITATIONS.md" + }, + { + "@id": "modules.json" + }, + { + "@id": "docs/usage.md" + }, + { + "@id": "docs/output.md" + }, + { + "@id": ".nf-core.yml" + }, + { + "@id": ".pre-commit-config.yaml" + }, + { + "@id": ".prettierignore" + } + ], + "isBasedOn": "https://github.com/nf-core/stableexpression", + "license": "MIT", + "mainEntity": { + "@id": "main.nf" + }, + "mentions": [ + { + "@id": "#6aa6a373-9bb0-4502-a8f4-2fce1f6296ee" + } + ], + "name": "nf-core/stableexpression" + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": [ + { + "@id": "https://w3id.org/ro/crate/1.1" + }, + { + "@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0" + } + ] + }, + { + "@id": "main.nf", + "@type": [ + "File", + "SoftwareSourceCode", + "ComputationalWorkflow" + ], + "creator": [ + { + "@id": "https://orcid.org/0000-0003-3387-1040" + } + ], + "dateCreated": "", + "dateModified": "2026-03-14T10:55:43Z", + "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", + "keywords": [ + "nf-core", + "nextflow", + "expression", + "housekeeping-genes", + "qpcr-analysis" + ], + "license": [ + "MIT" + ], + "maintainer": [ + { + "@id": "https://orcid.org/0000-0003-3387-1040" + } + ], + "name": [ + "nf-core/stableexpression" + ], + "programmingLanguage": { + 
"@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" + }, + "sdPublisher": { + "@id": "https://nf-co.re/" + }, + "url": [ + "https://github.com/nf-core/stableexpression", + "https://nf-co.re/stableexpression/1.0.0/" + ], + "version": [ + "1.0.0" + ] + }, + { + "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", + "@type": "ComputerLanguage", + "identifier": { + "@id": "https://www.nextflow.io/" + }, + "name": "Nextflow", + "url": { + "@id": "https://www.nextflow.io/" + }, + "version": "!>=25.04.0" + }, + { + "@id": "#6aa6a373-9bb0-4502-a8f4-2fce1f6296ee", + "@type": "TestSuite", + "instance": [ + { + "@id": "#2fa3572f-894b-4153-a0a8-dca4386f0cea" + } + ], + "mainEntity": { + "@id": "main.nf" + }, + "name": "Test suite for nf-core/stableexpression" + }, + { + "@id": "#2fa3572f-894b-4153-a0a8-dca4386f0cea", + "@type": "TestInstance", + "name": "GitHub Actions workflow for testing nf-core/stableexpression", + "resource": "repos/nf-core/stableexpression/actions/workflows/nf-test.yml", + "runsOn": { + "@id": "https://w3id.org/ro/terms/test#GithubService" + }, + "url": "https://api.github.com" + }, + { + "@id": "https://w3id.org/ro/terms/test#GithubService", + "@type": "TestService", + "name": "Github Actions", + "url": { + "@id": "https://github.com" + } + }, + { + "@id": "assets/", + "@type": "Dataset", + "description": "Additional files" + }, + { + "@id": "bin/", + "@type": "Dataset", + "description": "Scripts that must be callable from a pipeline process" + }, + { + "@id": "conf/", + "@type": "Dataset", + "description": "Configuration files" + }, + { + "@id": "docs/", + "@type": "Dataset", + "description": "Markdown files for documenting the pipeline" + }, + { + "@id": "docs/images/", + "@type": "Dataset", + "description": "Images for the documentation files" + }, + { + "@id": "modules/", + "@type": "Dataset", + "description": "Modules used by the pipeline" + }, + { + "@id": "modules/local/", + "@type": "Dataset", + "description": 
"Pipeline-specific modules" + }, + { + "@id": "modules/nf-core/", + "@type": "Dataset", + "description": "nf-core modules" + }, + { + "@id": "workflows/", + "@type": "Dataset", + "description": "Main pipeline workflows to be executed in main.nf" + }, + { + "@id": "subworkflows/", + "@type": "Dataset", + "description": "Smaller subworkflows" + }, + { + "@id": "nextflow.config", + "@type": "File", + "description": "Main Nextflow configuration file" + }, + { + "@id": "README.md", + "@type": "File", + "description": "Basic pipeline usage information" + }, + { + "@id": "nextflow_schema.json", + "@type": "File", + "description": "JSON schema for pipeline parameter specification" + }, + { + "@id": "CHANGELOG.md", + "@type": "File", + "description": "Information on changes made to the pipeline" + }, + { + "@id": "LICENSE", + "@type": "File", + "description": "The license - should be MIT" + }, + { + "@id": "CODE_OF_CONDUCT.md", + "@type": "File", + "description": "The nf-core code of conduct" + }, + { + "@id": "CITATIONS.md", + "@type": "File", + "description": "Citations needed when using the pipeline" + }, + { + "@id": "modules.json", + "@type": "File", + "description": "Version information for modules from nf-core/modules" + }, + { + "@id": "docs/usage.md", + "@type": "File", + "description": "Usage documentation" + }, + { + "@id": "docs/output.md", + "@type": "File", + "description": "Output documentation" + }, + { + "@id": ".nf-core.yml", + "@type": "File", + "description": "nf-core configuration file, configuring template features and linting rules" + }, + { + "@id": ".pre-commit-config.yaml", + "@type": "File", + "description": "Configuration file for pre-commit hooks" + }, + { + "@id": ".prettierignore", + "@type": "File", + "description": "Ignore file for prettier" + }, + { + "@id": "https://nf-co.re/", + "@type": "Organization", + "name": "nf-core", + "url": "https://nf-co.re/" + }, + { + "@id": "https://orcid.org/0000-0003-3387-1040", + "@type": "Person", + 
"email": "coen.olivier@gmail.com", + "name": "Olivier Coen" + } + ] +} \ No newline at end of file diff --git a/subworkflows/local/dataset_analysis/main.nf b/subworkflows/local/dataset_analysis/main.nf new file mode 100644 index 00000000..66ec06ed --- /dev/null +++ b/subworkflows/local/dataset_analysis/main.nf @@ -0,0 +1,23 @@ +include { COMPUTE_DATASET_STATISTICS as DESCRIPTIVE_STATISTICS } from '../../../modules/local/compute_dataset_statistics' + +/* +======================================================================================== + SUBWORKFLOW TO COMPUTE VARIOUS STATISTICS AT THE DATASET / SAMPLE LEVEL +======================================================================================== +*/ + +workflow DATASET_ANALYSIS { + + take: + ch_counts + + main: + + // ----------------------------------------------------------------- + // COMPUTE VARIOUS STATISTICS AT THE SAMPLE LEVEL + // ----------------------------------------------------------------- + + DESCRIPTIVE_STATISTICS ( ch_counts ) + + +} diff --git a/subworkflows/local/download_public_datasets/main.nf b/subworkflows/local/download_public_datasets/main.nf new file mode 100644 index 00000000..6ea3f11f --- /dev/null +++ b/subworkflows/local/download_public_datasets/main.nf @@ -0,0 +1,66 @@ +include { EXPRESSIONATLAS_GETDATA as EXPRESSION_ATLAS } from '../../../modules/local/expressionatlas/getdata' +include { GEO_GETDATA as GEO } from '../../../modules/local/geo/getdata' + +include { addDatasetIdToMetadata } from '../utils_nfcore_stableexpression_pipeline' +include { groupFilesByDatasetId } from '../utils_nfcore_stableexpression_pipeline' +include { augmentMetadata } from '../utils_nfcore_stableexpression_pipeline' + +/* +======================================================================================== + SUBWORKFLOW TO DOWNLOAD GEO ACCESSIONS AND DATASETS +======================================================================================== +*/ + +workflow DOWNLOAD_PUBLIC_DATASETS { + + 
take: + species + ch_accessions + + + main: + + ch_datasets = channel.empty() + ch_fetched_accessions = channel.empty() + + ch_accessions = ch_accessions + .branch { acc -> + eatlas: acc.startsWith('E-') + geo: acc.startsWith('GSE') + } + + // ------------------------------------------------------------------------------------ + // DOWNLOAD EXPRESSION ATLAS DATASETS + // ------------------------------------------------------------------------------------ + + // Downloading Expression Atlas data for each accession in ch_accessions + EXPRESSION_ATLAS( ch_accessions.eatlas ) + + // ------------------------------------------------------------------------------------ + // DOWNLOAD GEO DATASETS + // ------------------------------------------------------------------------------------ + + // Downloading GEO datasets for each accession in ch_accessions + GEO( + ch_accessions.geo, + species + ) + + ch_downloaded_counts = EXPRESSION_ATLAS.out.counts.mix ( GEO.out.counts ) + ch_downloaded_design = EXPRESSION_ATLAS.out.design.mix ( GEO.out.design ) + + // adding dataset id (accession + data_type) in the file meta + // flattening in case multiple files are returned at once + ch_counts = addDatasetIdToMetadata( ch_downloaded_counts.flatten() ) + ch_design = addDatasetIdToMetadata( ch_downloaded_design.flatten() ) + + // adding design files to the meta of their respective count files + ch_datasets = groupFilesByDatasetId( ch_design, ch_counts ) + + // adding normalisation state in the meta + ch_datasets = augmentMetadata( ch_datasets ) + + emit: + datasets = ch_datasets + +} diff --git a/subworkflows/local/expression_normalisation/main.nf b/subworkflows/local/expression_normalisation/main.nf new file mode 100644 index 00000000..84130edd --- /dev/null +++ b/subworkflows/local/expression_normalisation/main.nf @@ -0,0 +1,83 @@ +include { NORMALISATION_COMPUTE_CPM as COMPUTE_CPM } from '../../../modules/local/normalisation/compute_cpm' +include { NORMALISATION_COMPUTE_TPM as 
COMPUTE_TPM } from '../../../modules/local/normalisation/compute_tpm' +include { QUANTILE_NORMALISATION } from '../../../modules/local/quantile_normalisation' + +include { GET_TRANSCRIPT_LENGTHS } from '../../../subworkflows/local/get_transcript_lengths' + +/* +======================================================================================== + SUBWORKFLOW TO NORMALISE AND HARMONISE EXPRESSION DATASETS +======================================================================================== +*/ + +workflow EXPRESSION_NORMALISATION { + + take: + species + ch_datasets + normalisation_method + quantile_norm_target_distrib + gff_file + gene_length_file + + main: + + // + // MODULE: normalisation of raw count datasets (including downloaded RNA-seq datasets) + // at the same time, removing genes that show only zero counts + // + + ch_datasets = ch_datasets.branch { + meta, file -> + raw: meta.normalised == false + normalised: meta.normalised == true + } + + ch_raw_rnaseq_datasets_to_normalise = ch_datasets.raw.filter { meta, file -> meta.platform == 'rnaseq' } + + if ( normalisation_method == 'tpm' ) { + + if ( gene_length_file ) { + + ch_gene_length_file = channel.fromPath( gene_length_file, checkIfExists: true ) + + } else { + + // download genome annotation + // and computing length of the longest transcript gene per gene + GET_TRANSCRIPT_LENGTHS( + species, + gff_file + ) + ch_gene_length_file = GET_TRANSCRIPT_LENGTHS.out.csv + + } + + COMPUTE_TPM( + ch_raw_rnaseq_datasets_to_normalise, + ch_gene_length_file + ) + ch_raw_rnaseq_datasets_normalised = COMPUTE_TPM.out.counts + + } else { // 'cpm' + + COMPUTE_CPM( ch_raw_rnaseq_datasets_to_normalise ) + ch_raw_rnaseq_datasets_normalised = COMPUTE_CPM.out.counts + + } + + // + // MODULE: Quantile normalisation + // + + // putting all normalised count datasets together and performing quantile normalisation + QUANTILE_NORMALISATION ( + ch_datasets.normalised.mix( ch_raw_rnaseq_datasets_normalised ), + 
quantile_norm_target_distrib + ) + + + emit: + counts = QUANTILE_NORMALISATION.out.counts + +} diff --git a/subworkflows/local/gene_statistics/main.nf b/subworkflows/local/gene_statistics/main.nf new file mode 100644 index 00000000..bfb924b9 --- /dev/null +++ b/subworkflows/local/gene_statistics/main.nf @@ -0,0 +1,47 @@ +include { COMPUTE_GENE_STATISTICS as GLOBAL } from '../../../modules/local/compute_gene_statistics' +include { COMPUTE_GENE_STATISTICS as PLATFORM } from '../../../modules/local/compute_gene_statistics' + +/* +======================================================================================== + SUBWORKFLOW TO COMPUTE GENE STATISTICS (PER PLATFORM AND ON ALL DATA) +======================================================================================== +*/ + +workflow GENE_STATISTICS { + + take: + ch_all_imputed_counts + ch_all_counts + ch_platform_counts + ch_ratio_nulls_per_sample_file + max_null_ratio_valid_sample + + main: + + // ----------------------------------------------------------------- + // PLATFORM-SPECIFIC STATISTICS + // ----------------------------------------------------------------- + + // platform counts have not been imputed + PLATFORM( + ch_platform_counts.map{ meta, file -> [ meta, file, [] ] }, + ch_ratio_nulls_per_sample_file.collect(), + max_null_ratio_valid_sample + ) + + + // ----------------------------------------------------------------- + // ALL DATA + // ----------------------------------------------------------------- + + GLOBAL( + ch_all_counts.join( ch_all_imputed_counts ).collect(), + ch_ratio_nulls_per_sample_file.collect(), + max_null_ratio_valid_sample + ) + + emit: + stats = GLOBAL.out.stats + platform_stats = PLATFORM.out.stats + +} diff --git a/subworkflows/local/genorm/main.nf b/subworkflows/local/genorm/main.nf new file mode 100644 index 00000000..c491566f --- /dev/null +++ b/subworkflows/local/genorm/main.nf @@ -0,0 +1,102 @@ +// +// Subworkflow with functionality specific to the nf-core/stableexpression 
pipeline +// + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { MAKE_CHUNKS } from '../../../modules/local/genorm/make_chunks' +include { CROSS_JOIN } from '../../../modules/local/genorm/cross_join' +include { EXPRESSION_RATIO } from '../../../modules/local/genorm/expression_ratio' +include { RATIO_STANDARD_VARIATION } from '../../../modules/local/genorm/ratio_standard_variation' +include { COMPUTE_M_MEASURE } from '../../../modules/local/genorm/compute_m_measure' + +/* +======================================================================================== + SUBWORKFLOW TO COMPUTE PAIRWISE GENE VARIATION +======================================================================================== +*/ + +workflow GENORM { + + take: + ch_counts + + + main: + + // ----------------------------------------------------------------- + // MAKE CHUNKS OF GENE COUNTS + // ----------------------------------------------------------------- + + MAKE_CHUNKS( ch_counts ) + + // we need to flatten to set each chunk file as a separate item in the channel + ch_count_chunks = getUniqueFilePairs( MAKE_CHUNKS.out.chunks.transpose() ) + + // ----------------------------------------------------------------- + // CROSS JOIN CHUNKS + // ----------------------------------------------------------------- + + CROSS_JOIN( ch_count_chunks ) + + // ----------------------------------------------------------------- + // PAIRWISE EXPRESSION RATIOS + // ----------------------------------------------------------------- + + EXPRESSION_RATIO( CROSS_JOIN.out.data ) + + // ----------------------------------------------------------------- + // STANDARD VARIATION OF EXPRESSION RATIOS + // ----------------------------------------------------------------- + + RATIO_STANDARD_VARIATION( EXPRESSION_RATIO.out.data ) + + // 
----------------------------------------------------------------- + // COMPUTE M-MEASURE + // ----------------------------------------------------------------- + + ch_ratio_files = RATIO_STANDARD_VARIATION.out.data + .map{ meta, file -> [ [ section: meta.section ], file ] } + .groupTuple() + + COMPUTE_M_MEASURE( + ch_counts.join( ch_ratio_files ) + ) + + emit: + m_measures = COMPUTE_M_MEASURE.out.m_measures + +} + + +/* +======================================================================================== + FUNCTIONS +======================================================================================== +*/ + +// +// Generate channels consisting of unique pairs of files +// +def getUniqueFilePairs( ch_count_chunks ) { + + def ch_count_chunks_with_indexes = ch_count_chunks + .map { meta, file -> [meta, file.name.tokenize('.')[1], file] } // extract file index + + return ch_count_chunks_with_indexes + .combine( // full cartesian product with itself, using the meta map as key + ch_count_chunks_with_indexes, + by: 0 + ) + .filter { + meta, i, file_i, j, file_j -> i <= j } // keeps only pairs where i <= j + .map { + meta, i, file_i, j, file_j -> + def new_meta = meta + [ index_1: i, index_2: j ] // puts indexes in a meta tuple + [ new_meta, file_i, file_j ] + } +} diff --git a/subworkflows/local/get_public_accessions/main.nf b/subworkflows/local/get_public_accessions/main.nf new file mode 100644 index 00000000..debb29e0 --- /dev/null +++ b/subworkflows/local/get_public_accessions/main.nf @@ -0,0 +1,137 @@ +include { EXPRESSIONATLAS_GETACCESSIONS as EXPRESSION_ATLAS } from '../../../modules/local/expressionatlas/getaccessions' +include { GEO_GETACCESSIONS as GEO } from '../../../modules/local/geo/getaccessions' + +/* +======================================================================================== + SUBWORKFLOW TO GET PUBLIC DATASET ACCESSIONS FROM EXPRESSION ATLAS AND GEO +======================================================================================== +*/ + 
+workflow GET_PUBLIC_ACCESSIONS { + + take: + species + skip_fetch_eatlas_accessions + fetch_geo_accessions + platform + keywords + ch_accessions + ch_accessions_file + ch_excluded_accessions + ch_excluded_accessions_file + random_sampling_size + random_sampling_seed + outdir + + main: + + ch_fetched_eatlas_accessions = channel.empty() + ch_fetched_geo_accessions = channel.empty() + ch_sampling_quota = channel.of( "ok" ) + + // ----------------------------------------------------------------- + // GET EATLAS ACCESSIONS + // ----------------------------------------------------------------- + + // fetching Expression Atlas accessions if applicable + if ( !skip_fetch_eatlas_accessions ) { + + // getting Expression Atlas accessions given a species name and keywords + // keywords can be an empty string + EXPRESSION_ATLAS( + species, + keywords, + platform?: [], + random_sampling_size?: [], + random_sampling_seed?: [] + ) + + ch_fetched_eatlas_accessions = EXPRESSION_ATLAS.out.accessions.splitText() + ch_sampling_quota = EXPRESSION_ATLAS.out.sampling_quota + + } + + // ------------------------------------------------------------------------------------ + // GET GEO ACCESSIONS + // ------------------------------------------------------------------------------------ + + // fetching GEO accessions if applicable + if ( fetch_geo_accessions ) { + + // all Expression Atlas accessions starting with E-GEOD- are imported from GEO + // we do not want to collect these GEO data if we already get them from Expression Atlas + ch_excluded_eatlas_accessions_file = ch_fetched_eatlas_accessions + .filter { accession -> accession.startsWith("E-GEOD-") } + .map { accession -> accession.replace("E-GEOD-", "GSE") } + .collectFile( + name: 'excluded_geo_accessions.txt', + storeDir: "${outdir}/geo/", + sort: true, + newLine: true + ) + .ifEmpty( [] ) + + // trick to avoid fetching accessions from GEO when the sampling quota is already exceeded + ch_species = channel.of( species ) + .combine( 
ch_sampling_quota ) + .filter { species_name, quota -> quota == "ok" } + .map { species_name, quota -> species_name } + + // getting GEO accessions given a species name and keywords + // keywords can be an empty string + GEO( + ch_species, + keywords, + platform?: [], + ch_excluded_eatlas_accessions_file, + random_sampling_size?: [], + random_sampling_seed?: [] + ) + + ch_fetched_geo_accessions = GEO.out.accessions.splitText() + } + + // ----------------------------------------------------------------- + // MERGING AND EXCLUDING UNWANTED ACCESSIONS + // ----------------------------------------------------------------- + + // getting accessions to exclude and preparing in the right format + ch_excluded_accessions = ch_excluded_accessions + .mix( ch_excluded_accessions_file.splitText() ) + .unique() + .map { acc -> acc.trim() } + .toList() + .map { lst -> [lst] } // list of lists : mandatory when combining in the next step + + ch_fetched_public_accessions = ch_fetched_eatlas_accessions + .mix( ch_fetched_geo_accessions ) + .map { acc -> acc.trim() } + .filter { acc -> + (acc.startsWith('E-') || acc.startsWith('GSE')) && !acc.startsWith('E-PROT-') + } + .combine ( ch_excluded_accessions ) + .filter { accession, excluded_accessions -> !(accession in excluded_accessions) } + .map { accession, excluded_accessions -> accession } + + // ----------------------------------------------------------------- + // ADDING USER PROVIDED ACCESSIONS + // ----------------------------------------------------------------- + + ch_input_accessions = ch_accessions + .mix( ch_accessions_file.splitText() ) + .unique() + .map { acc -> acc.trim() } + + // appending to accessions provided by the user + // ensures that no accessions is present twice (provided by the user and fetched from E. 
Atlas) + // removing E-PROT- accessions because they are not supported in subsequent steps + // removing excluded accessions + ch_all_accessions = ch_input_accessions + .mix( ch_fetched_public_accessions ) + .unique() + .map { acc -> acc.trim() } + + emit: + accessions = ch_all_accessions + +} diff --git a/subworkflows/local/get_transcript_lengths/main.nf b/subworkflows/local/get_transcript_lengths/main.nf new file mode 100644 index 00000000..26213245 --- /dev/null +++ b/subworkflows/local/get_transcript_lengths/main.nf @@ -0,0 +1,35 @@ +include { COMPUTE_GENE_TRANSCRIPT_LENGTHS } from '../../../modules/local/compute_gene_transcript_lengths' +include { DOWNLOAD_ENSEMBL_ANNOTATION } from '../../../modules/local/download_ensembl_annotation' + + +/* +======================================================================================== + SUBWORKFLOW TO GET A GENOME ANNOTATION AND COMPUTE GENE TRANSCRIPT LENGTHS +======================================================================================== +*/ + +workflow GET_TRANSCRIPT_LENGTHS { + + take: + species + gff_file + + main: + + if ( gff_file ) { + ch_annotation = channel.fromPath( gff_file, checkIfExists: true ) + } else { + DOWNLOAD_ENSEMBL_ANNOTATION( species ) + ch_annotation = DOWNLOAD_ENSEMBL_ANNOTATION.out.gff3 + } + + COMPUTE_GENE_TRANSCRIPT_LENGTHS( ch_annotation ) + + + + emit: + csv = COMPUTE_GENE_TRANSCRIPT_LENGTHS.out.csv + + + +} diff --git a/subworkflows/local/idmapping/main.nf b/subworkflows/local/idmapping/main.nf new file mode 100644 index 00000000..489f97dd --- /dev/null +++ b/subworkflows/local/idmapping/main.nf @@ -0,0 +1,171 @@ +include { CLEAN_GENE_IDS } from '../../../modules/local/clean_gene_ids' +include { EXTRACT_GENE_IDS } from '../../../modules/local/extract_gene_ids' +include { COLLECT_ALL_GENE_IDS } from '../../../modules/local/collect_all_gene_ids' +include { GPROFILER_IDMAPPING } from '../../../modules/local/gprofiler/idmapping' +include { DETECT_RARE_GENES } from 
'../../../modules/local/detect_rare_genes' +include { FILTER_AND_RENAME_GENES } from '../../../modules/local/filter_and_rename_genes' + +/* +======================================================================================== + SUBWORKFLOW TO CLEAN, MAP AND FILTER GENE IDS +======================================================================================== +*/ + +workflow ID_MAPPING { + + take: + ch_counts + species + skip_id_mapping + skip_cleaning_gene_ids + gprofiler_target_db + custom_gene_id_mapping + custom_gene_metadata + min_occurrence_freq + min_occurrence_quantile + outdir + + main: + + ch_gene_id_mapping = channel.empty() + ch_gene_metadata = channel.empty() + + + // ----------------------------------------------------------------- + // IN CASE OF ID MAPPING, CLEANING GENE IDS BEFOREHAND + // ----------------------------------------------------------------- + + if ( !skip_id_mapping && !skip_cleaning_gene_ids ) { + + // ensuring that all gene ids are valid before mapping + CLEAN_GENE_IDS ( ch_counts ) + ch_counts = CLEAN_GENE_IDS.out.counts + + } + + // ----------------------------------------------------------------- + // EXTRACTING GENE IDS FROM COUNTS FILE + // ----------------------------------------------------------------- + + EXTRACT_GENE_IDS ( ch_counts ) + ch_gene_ids = EXTRACT_GENE_IDS.out.gene_ids + + + + if ( skip_id_mapping ) { + + // ----------------------------------------------------------------- + // MAKING FILE CONTAINING ALL GENE UNIQUE GENE IDS (ALL GENE IDS ARE VALID) + // ----------------------------------------------------------------- + + ch_valid_gene_ids = ch_gene_ids + .splitText() + .map { it.trim() } + .unique() + .collectFile( + name: 'gene_ids.txt', + newLine: true, + storeDir: "${outdir}/idmapping/", + sort: true + ) + + } else { + + // ----------------------------------------------------------------- + // COLLECTING ALL CLEANED GENE IDS FROM ALL DATASETS + // 
----------------------------------------------------------------- + + // sorting files in order to have a consistent input and be able to retry + COLLECT_ALL_GENE_IDS( + ch_gene_ids.toSortedList() + ) + + // ----------------------------------------------------------------- + // MAPPING THESE GENE IDS TO THE CHOSEN TARGET DB + // ----------------------------------------------------------------- + + GPROFILER_IDMAPPING( + COLLECT_ALL_GENE_IDS.out.unique_gene_ids, + species, + gprofiler_target_db + ) + ch_gene_id_mapping = GPROFILER_IDMAPPING.out.mapping + ch_gene_metadata = GPROFILER_IDMAPPING.out.metadata + + // ----------------------------------------------------------------- + // FILTERING OUT GENE IDS THAT DO NOT HAVE ENOUGH OCCURRENCES + // ----------------------------------------------------------------- + + DETECT_RARE_GENES( + ch_gene_id_mapping, + COLLECT_ALL_GENE_IDS.out.gene_id_occurrences, + ch_counts.count(), + min_occurrence_freq, + min_occurrence_quantile + ) + ch_valid_gene_ids = DETECT_RARE_GENES.out.valid_gene_ids + } + + // ----------------------------------------------------------------- + // COLLECTING GLOBAL GENE ID MAPPING AND METADATA + // ----------------------------------------------------------------- + + ch_global_gene_id_mapping = ch_gene_id_mapping + .mix( + custom_gene_id_mapping ? + channel.fromPath( custom_gene_id_mapping, checkIfExists: true ) : + channel.empty() + ) + .splitCsv( header: true ) + .unique() + .collectFile( + name: 'global_gene_id_mapping.csv', + seed: "original_gene_id,gene_id", + newLine: true, + storeDir: "${outdir}/idmapping/", + sort: true + ) { + item -> "${item["original_gene_id"]},${item["gene_id"]}" + } + + ch_global_gene_metadata = ch_gene_metadata + .mix( + custom_gene_metadata ? 
+ channel.fromPath( custom_gene_metadata, checkIfExists: true ) : + channel.empty() + ) + .splitCsv( header: true ) + .unique() + .collectFile( + name: 'global_gene_metadata.csv', + seed: "gene_id,name,description", + newLine: true, + storeDir: "${outdir}/idmapping/", + sort: true + ) { + item -> "${item["gene_id"]},${item["name"]},${item["description"]}" + } + + // ----------------------------------------------------------------- + // RENAMING GENE IDS IN ALL COUNT DATASETS (ONLY IF NECESSARY) + // ----------------------------------------------------------------- + + if ( !skip_id_mapping || custom_gene_id_mapping ) { + + FILTER_AND_RENAME_GENES( + ch_counts, + ch_global_gene_id_mapping.first(), + ch_valid_gene_ids.collect() + ) + ch_counts = FILTER_AND_RENAME_GENES.out.counts + + } + + + emit: + counts = ch_counts + mapping = ch_global_gene_id_mapping + metadata = ch_global_gene_metadata + valid_gene_ids = ch_valid_gene_ids + +} diff --git a/subworkflows/local/merge_data/main.nf b/subworkflows/local/merge_data/main.nf new file mode 100644 index 00000000..a093c8f5 --- /dev/null +++ b/subworkflows/local/merge_data/main.nf @@ -0,0 +1,97 @@ +include { MERGE_COUNTS as PLATFORM } from '../../../modules/local/merge_counts' +include { MERGE_COUNTS as GLOBAL } from '../../../modules/local/merge_counts' +include { IMPUTE_MISSING_VALUES } from '../../../modules/local/impute_missing_values' + +/* +======================================================================================== + SUBWORKFLOW TO MERGE COUNT DATASETS, IMPUTE MISSING VALUES AND MERGE DESIGNS +======================================================================================== +*/ + +workflow MERGE_DATA { + + take: + ch_normalised_counts + missing_value_imputer + outdir + + main: + + // ----------------------------------------------------------------- + // MERGE COUNTS FOR EACH PLATFORM SEPARATELY + // ----------------------------------------------------------------- + + + ch_normalised_rnaseq_counts = 
ch_normalised_counts.filter { meta, file -> meta.platform == "rnaseq" } + ch_normalised_microarray_counts = ch_normalised_counts.filter { meta, file -> meta.platform == "microarray" } + + ch_collected_rnaseq_counts = ch_normalised_rnaseq_counts + .map { meta, file -> file } + .collect( sort: true ) + .map { files -> [ [ platform: "rnaseq" ], files ] } + + ch_collected_microarray_counts = ch_normalised_microarray_counts + .map { meta, file -> file } + .collect( sort: true ) + .map { files -> [ [ platform: "microarray" ], files ] } + + PLATFORM ( + ch_collected_rnaseq_counts.concat( ch_collected_microarray_counts ) + ) + + ch_platform_counts = PLATFORM.out.counts + + // ----------------------------------------------------------------- + // MERGE ALL COUNTS + // ----------------------------------------------------------------- + + ch_collected_merged_counts = ch_platform_counts + .map { meta, file -> file } + .collect( sort: true ) + .map { files -> [ [ platform: "all" ], files ] } + + GLOBAL( ch_collected_merged_counts.collect() ) + ch_all_counts = GLOBAL.out.counts + + // ----------------------------------------------------------------- + // IMPUTE MISSING VALUES + // ----------------------------------------------------------------- + + IMPUTE_MISSING_VALUES( + ch_all_counts.collect(), + missing_value_imputer + ) + + // ----------------------------------------------------------------- + // MERGE ALL DESIGNS IN A SINGLE TABLE + // ----------------------------------------------------------------- + + ch_whole_design = ch_normalised_counts + .map { + meta, file -> // extracts design file and adds batch column whenever missing (for custom datasets) + def design_content = meta.design.splitCsv( header: true ) + // if there is no batch, it is custom data + def updated_design_content = design_content.collect { row -> + row.batch = row.batch ?: "custom_${meta.dataset}" + return row + } + [ updated_design_content ] + } + .flatten() + .unique() + .collectFile( + name: 
'whole_design.csv', + seed: "batch,condition,sample", + newLine: true, + sort: true, + storeDir: "${outdir}/merged_datasets/" + ) { + item -> "${item.batch},${item.condition},${item.sample}" + } + + emit: + all_imputed_counts = IMPUTE_MISSING_VALUES.out.counts + all_counts = ch_all_counts + platform_counts = ch_platform_counts + whole_design = ch_whole_design +} diff --git a/subworkflows/local/reporting/main.nf b/subworkflows/local/reporting/main.nf new file mode 100644 index 00000000..0c560204 --- /dev/null +++ b/subworkflows/local/reporting/main.nf @@ -0,0 +1,400 @@ +include { AGGREGATE_RESULTS } from '../../../modules/local/aggregate_results' +include { DASH_APP } from '../../../modules/local/dash_app' +include { COLLECT_STATISTICS } from '../../../modules/local/collect_statistics' +include { MULTIQC } from '../../../modules/nf-core/multiqc' + + +include { methodsDescriptionText } from '../utils_nfcore_stableexpression_pipeline' +include { paramsSummaryMultiqc } from '../../nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../../nf-core/utils_nfcore_pipeline' +include { paramsSummaryMap } from 'plugin/nf-schema' + +/* +======================================================================================== + SUBWORKFLOW TO AGGREGATE RESULTS, COLLECT STATISTICS AND BUILD THE DASH APP / MULTIQC REPORT +======================================================================================== +*/ + +workflow REPORTING { + + take: + ch_all_counts + ch_whole_design + ch_stats_all_genes_with_scores + ch_platform_statistics + ch_whole_gene_metadata + ch_whole_gene_id_mapping + target_genes + target_gene_file + multiqc_config + multiqc_logo + multiqc_methods_description + outdir + + main: + + ch_versions = channel.empty() + + // ----------------------------------------------------------------- + // AGGREGATE ALL RESULTS FOR MULTIQC + // ----------------------------------------------------------------- + + ch_target_gene_file = target_gene_file ? 
channel.fromPath( target_gene_file, checkIfExists: true ) : channel.empty() + + ch_target_gene_list = channel.fromList( target_genes.tokenize(',') ) + .mix( ch_target_gene_file.splitText() ) + .map { it.trim() } + .filter { it != "" } + .unique() + .toSortedList() + + ch_custom_content_multiqc_config_template = channel.fromPath( + "${projectDir}/assets/multiqc_config.custom_content.template.yaml", + checkIfExists: true + ) + + AGGREGATE_RESULTS ( + ch_all_counts.map{ meta, file -> file }.collect(), + ch_stats_all_genes_with_scores.collect(), + ch_platform_statistics.collect(), + ch_target_gene_list, + ch_whole_gene_metadata.collect().ifEmpty([]), // handle case where there are no mappings + ch_whole_gene_id_mapping.collect().ifEmpty([]), // handle case where there are no mappings + ch_custom_content_multiqc_config_template.collect() + ) + + ch_all_genes_summary = AGGREGATE_RESULTS.out.all_genes_summary + ch_most_stable_genes_summary = AGGREGATE_RESULTS.out.most_stable_genes_summary + ch_most_stable_genes_transposed_counts = AGGREGATE_RESULTS.out.most_stable_genes_transposed_counts_filtered + ch_custom_content_multiqc_config = AGGREGATE_RESULTS.out.custom_content_multiqc_config + + // ----------------------------------------------------------------- + // DASH APPLICATION + // ----------------------------------------------------------------- + + DASH_APP( + ch_all_counts.map{ meta, file -> file }.collect(), + ch_whole_design.collect(), + ch_all_genes_summary.collect() + ) + ch_versions = ch_versions.mix ( DASH_APP.out.versions ) + + + // ------------------------------------------------------------------------------------ + // PREPARING BAR PLOTS + // ------------------------------------------------------------------------------------ + + ch_id_mapping_stats = channel.topic('mqc_id_mapping_stats') + .collectFile( + name: 'id_mapping_stats.csv', + seed: "dataset,final,merged,not_valid,unmapped", + newLine: true, + storeDir: "${outdir}/statistics/" + ) { + item -> 
"${item[0]},${item[1]},${item[2]},${item[3]},${item[4]}" + } + + ch_missing_values_filter_stats = channel.topic('mqc_missing_values_filter_stats') + .collectFile( + name: 'missing_values_filter_stats.csv', + seed: "dataset,kept,rejected", + newLine: true, + storeDir: "${outdir}/statistics/" + ) { + item -> "${item[0]},${item[1]},${item[2]}" + } + + ch_zero_values_filter_stats = channel.topic('mqc_zero_values_filter_stats') + .collectFile( + name: 'zero_values_filter_stats.csv', + seed: "dataset,kept,rejected", + newLine: true, + storeDir: "${outdir}/statistics/" + ) { + item -> "${item[0]},${item[1]},${item[2]}" + } + + // ------------------------------------------------------------------------------------ + // PREPARING BOX PLOTS + // ------------------------------------------------------------------------------------ + + ch_skewness = channel.topic('skewness') + .map { dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with skewness values + .collectFile( + name: 'skewness.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + + ch_ratio_zeros = channel.topic('ratio_zeros') + .map { dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with ratio values + .collectFile( + name: 'ratio_zeros.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + ch_ratio_nulls = channel.topic('ratio_nulls') + .map { dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with ratio values + .collectFile( + name: 'ratio_nulls.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + ch_stat_files = ch_skewness + .mix( ch_ratio_nulls ) + .mix( ch_ratio_zeros ) + + COLLECT_STATISTICS( ch_stat_files ) + + // ------------------------------------------------------------------------------------ + // FAILURE / WARNING REPORTS + // ------------------------------------------------------------------------------------ + + 
ch_eatlas_failure_reasons = channel.topic('eatlas_failure_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'eatlas_failure_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/", + ) { + item -> "${item[0]},${item[1]}" + } + + ch_eatlas_warning_reasons = channel.topic('eatlas_warning_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'eatlas_warning_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]},${item[1]}" + } + + ch_geo_failure_reasons = channel.topic('geo_failure_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'geo_failure_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]},${item[1]}" + } + + + ch_geo_warning_reasons = channel.topic('geo_warning_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'geo_warning_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]},${item[1]}" + } + + ch_id_cleaning_failure_reasons = channel.topic('id_cleaning_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'id_cleaning_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_id_mapping_warning_reasons = channel.topic('renaming_warning_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'renaming_warning_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_id_mapping_failure_reasons = 
channel.topic('renaming_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'renaming_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_normalisation_warning_reasons = channel.topic('normalisation_warning_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'normalisation_warning_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_normalisation_failure_reasons = channel.topic('normalisation_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'normalisation_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + + // ------------------------------------------------------------------------------------ + // MULTIQC FILES + // ------------------------------------------------------------------------------------ + + ch_multiqc_files = channel.empty() + .mix( ch_most_stable_genes_summary.collect() ) // single item + .mix( ch_all_genes_summary.collect() ) // single item + .mix( ch_most_stable_genes_transposed_counts.collect() ) // single item + .mix( channel.topic('eatlas_all_datasets').toSortedList() ) + .mix( channel.topic('eatlas_selected_datasets').toSortedList() ) + .mix( channel.topic('geo_all_datasets').toSortedList() ) + .mix( channel.topic('geo_selected_datasets').toSortedList() ) + .mix( channel.topic('geo_rejected_datasets').toSortedList() ) + .mix( channel.topic('total_gene_id_occurrence_quantiles').toSortedList() ) + .mix( COLLECT_STATISTICS.out.csv ) + .mix( ch_id_mapping_stats ) + .mix( ch_missing_values_filter_stats ) + .mix( ch_zero_values_filter_stats ) + .mix( ch_eatlas_failure_reasons ) + .mix( 
ch_eatlas_warning_reasons ) + .mix( ch_geo_failure_reasons ) + .mix( ch_geo_warning_reasons ) + .mix( ch_id_cleaning_failure_reasons ) + .mix( ch_id_mapping_warning_reasons ) + .mix( ch_id_mapping_failure_reasons ) + .mix( ch_normalisation_failure_reasons ) + .mix( ch_normalisation_warning_reasons ) + + + // ------------------------------------------------------------------------------------ + // VERSIONS + // ------------------------------------------------------------------------------------ + + // Collate and save software versions + // + def topic_versions = channel.topic("versions") + .distinct() + .branch { entry -> + versions_file: entry instanceof Path + versions_tuple: true + } + + def topic_versions_string = topic_versions.versions_tuple + .map { process, tool, version -> + [ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ] + } + .groupTuple(by:0) + .map { process, tool_versions -> + tool_versions.unique().sort() + "${process}:\n${tool_versions.join('\n')}" + } + + ch_collated_versions = softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file)) + .mix(topic_versions_string) + .collectFile( + storeDir: "${outdir}/pipeline_info", + name: 'nf_core_' + 'stableexpression_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true + ) + + // ------------------------------------------------------------------------------------ + // PREPARE MULTIQC INPUT + // ------------------------------------------------------------------------------------ + + ch_multiqc_config = channel.fromPath( + "$projectDir/assets/multiqc_config.yml", checkIfExists: true) + + ch_multiqc_custom_config = multiqc_config ? + channel.fromPath(multiqc_config, checkIfExists: true) : + channel.empty() + + ch_multiqc_logo = multiqc_logo ? 
+ channel.fromPath(multiqc_logo, checkIfExists: true) : + channel.of([]) + + summary_params = paramsSummaryMap( + workflow, + parameters_schema: "nextflow_schema.json" + ) + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) + + ch_multiqc_files = ch_multiqc_files + .mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') ) + + ch_multiqc_custom_methods_description = multiqc_methods_description ? + file(multiqc_methods_description, checkIfExists: true) : + file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + ch_methods_description = channel.value( + methodsDescriptionText(ch_multiqc_custom_methods_description) + ) + + // ------------------------------------------------------------------------------------ + // ADDING KEY TO JOIN ON + // ------------------------------------------------------------------------------------ + + ch_multiqc_file_list = ch_multiqc_files + .mix( ch_collated_versions ) + .mix( + ch_methods_description.collectFile( + name: 'methods_description_mqc.yaml', + sort: true + ) + ) + .flatten() + .toSortedList() + .map{ list -> [ [id: 'Final report'], list ] } + + ch_multiqc_config_list = ch_multiqc_config + .mix( ch_multiqc_custom_config ) + .mix( ch_custom_content_multiqc_config ) + .toSortedList() + .map{ list -> [ [id: 'Final report'], list ] } + + ch_multiqc_logo = ch_multiqc_logo.map{ file -> [ [id: 'Final report'], file ] } + + // ------------------------------------------------------------------------------------ + // MULTIQC + // ------------------------------------------------------------------------------------ + + ch_multiqc_input = ch_multiqc_file_list + .join( ch_multiqc_config_list ) + .join( ch_multiqc_logo ) + .map { meta, files, configs, logo -> [ meta, files, configs, logo , [], [] ] } + + MULTIQC ( ch_multiqc_input ) + + emit: + multiqc_report = MULTIQC.out.report + all_genes_summary = ch_all_genes_summary +} diff --git 
a/subworkflows/local/sample_filtering/main.nf b/subworkflows/local/sample_filtering/main.nf new file mode 100644 index 00000000..06ac9cec --- /dev/null +++ b/subworkflows/local/sample_filtering/main.nf @@ -0,0 +1,62 @@ +include { FILTER_OUT_SAMPLES_WITH_TOO_MANY_ZEROS as TOO_MANY_ZEROS } from '../../../modules/local/filter_out_samples/with_too_many_zeros' +include { FILTER_OUT_SAMPLES_WITH_TOO_MANY_MISSING_VALUES as TOO_MANY_MISSING_VALUES } from '../../../modules/local/filter_out_samples/with_too_many_missing_values' + + +/* +======================================================================================== + SUBWORKFLOW TO FILTER OUT INVALID SAMPLES AND EMIT STATISTICS ABOUT ZEROS / MISSING VALUES +======================================================================================== +*/ + +workflow SAMPLE_FILTERING { + + take: + ch_counts + ch_valid_gene_ids + max_zero_ratio + max_null_ratio + outdir + + main: + + // ----------------------------------------------------------------- + // REMOVE SAMPLES WITH TOO MANY ZEROS + // ----------------------------------------------------------------- + + TOO_MANY_ZEROS ( + ch_counts, + max_zero_ratio + ) + + // ----------------------------------------------------------------- + // REMOVE SAMPLES WITH TOO MANY MISSING VALUES + // ----------------------------------------------------------------- + + TOO_MANY_MISSING_VALUES( + TOO_MANY_ZEROS.out.counts, + ch_valid_gene_ids.collect(), + max_null_ratio + ) + + // ----------------------------------------------------------------- + // GET NUMBER OF NULLS PER SAMPLE + // ----------------------------------------------------------------- + + ch_ratio_nulls_per_sample_file = TOO_MANY_MISSING_VALUES.out.ratio_nulls_per_sample + .splitCsv( header: true ) + .collectFile( + name: 'ratio_nulls_per_sample.csv', + seed: "sample,ratio", + newLine: true, + storeDir: "${outdir}/statistics/", + sort: true + ) + { + item -> "${item["sample"]},${item["ratio"]}" + } + + emit: + counts = 
TOO_MANY_MISSING_VALUES.out.counts + ratio_nulls_per_sample_file = ch_ratio_nulls_per_sample_file + +} diff --git a/subworkflows/local/stability_scoring/main.nf b/subworkflows/local/stability_scoring/main.nf new file mode 100644 index 00000000..3f731949 --- /dev/null +++ b/subworkflows/local/stability_scoring/main.nf @@ -0,0 +1,87 @@ +include { GET_CANDIDATE_GENES } from '../../../modules/local/get_candidate_genes' +include { NORMFINDER } from '../../../modules/local/normfinder' +include { COMPUTE_STABILITY_SCORES } from '../../../modules/local/compute_stability_scores' + +include { GENORM } from '../genorm' + +/* +======================================================================================== + COMPUTE STABILITY SCORES +======================================================================================== +*/ + +workflow STABILITY_SCORING { + + take: + ch_counts + ch_design + ch_stats + nb_candidates_per_section + nb_sections + skip_genorm + stability_score_weights + + main: + + // ----------------------------------------------------------------- + // GETTING CANDIDATE GENES + // ----------------------------------------------------------------- + + GET_CANDIDATE_GENES( + ch_counts.collect(), // single item + ch_stats.collect(), // single item + nb_candidates_per_section, + nb_sections + ) + + ch_candidate_gene_counts = splitBySection( GET_CANDIDATE_GENES.out.counts ) + ch_section_stats = splitBySection( GET_CANDIDATE_GENES.out.section_stats ) + + // ----------------------------------------------------------------- + // NORMFINDER + // ----------------------------------------------------------------- + + NORMFINDER ( + ch_candidate_gene_counts, + ch_design.collect() // single item + ) + ch_normfinder_stabilities = NORMFINDER.out.stability_values + + // ----------------------------------------------------------------- + // GENORM + // ----------------------------------------------------------------- + + if ( !skip_genorm ) { + GENORM ( 
ch_candidate_gene_counts ) + ch_genorm_stability = GENORM.out.m_measures + } else { + ch_genorm_stability = channel.value([:]) + } + + // ----------------------------------------------------------------- + // AGGREGATION AND FINAL STABILITY SCORE + // ----------------------------------------------------------------- + //ch_normfinder_stabilities.join( ch_genorm_stability ).join( ch_section_stats ), + COMPUTE_STABILITY_SCORES ( + ch_normfinder_stabilities.join( ch_genorm_stability ).join( ch_section_stats ), + stability_score_weights + ) + + emit: + summary_statistics = COMPUTE_STABILITY_SCORES.out.stats_with_stability_scores + +} + +/* +======================================================================================== + FUNCTIONS +======================================================================================== +*/ + +def splitBySection( ch_files ) { + return ch_files + .map { files -> + files.collect { file -> [ [ section: file.name.tokenize(".")[0] ], file ] } + } + .flatMap{ n -> n } // turns a channel of one list of n files into a channel of n files +} diff --git a/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf b/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf index 66f4e9a8..ed736dd6 100644 --- a/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf @@ -2,41 +2,39 @@ // Subworkflow with functionality specific to the nf-core/stableexpression pipeline // -import org.yaml.snakeyaml.Yaml - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT FUNCTIONS / MODULES / SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { UTILS_NFVALIDATION_PLUGIN } from '../../nf-core/utils_nfvalidation_plugin' +include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' include { paramsSummaryMap } from 'plugin/nf-schema' -include 
{ UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { samplesheetToList } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' -include { dashedLine } from '../../nf-core/utils_nfcore_pipeline' -include { nfCoreLogo } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' -include { workflowCitation } from '../../nf-core/utils_nfcore_pipeline' -include { workflowVersionToYAML } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW TO INITIALISE PIPELINE -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow PIPELINE_INITIALISATION { take: version // boolean: Display version and exit - help // boolean: Display help text validate_params // boolean: Boolean whether to validate parameters against the schema at runtime monochrome_logs // boolean: Do not use coloured log outputs nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved + input // string: Path to input samplesheet + help // boolean: Display help message and exit + help_full // boolean: Show the full help message + show_hidden // boolean: Show hidden parameters in the help message main: @@ -53,16 +51,35 @@ workflow PIPELINE_INITIALISATION { // // Validate parameters and generate parameter summary to stdout // - pre_help_text = nfCoreLogo(monochrome_logs) - 
post_help_text = '\n' + workflowCitation() + '\n' + dashedLine(monochrome_logs) - def String workflow_command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " - UTILS_NFVALIDATION_PLUGIN ( - help, - workflow_command, - pre_help_text, - post_help_text, + before_text = """ +-\033[2m----------------------------------------------------\033[0m- + \033[0;32m,--.\033[0;30m/\033[0;32m,-.\033[0m +\033[0;34m ___ __ __ __ ___ \033[0;32m/,-._.--~\'\033[0m +\033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m +\033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m + \033[0;32m`._,._,\'\033[0m +\033[0;35m nf-core/stableexpression ${workflow.manifest.version}\033[0m +-\033[2m----------------------------------------------------\033[0m- +""" + after_text = """${workflow.manifest.doi ? "\n* The pipeline\n" : ""}${workflow.manifest.doi.tokenize(",").collect { doi -> " https://doi.org/${doi.trim().replace('https://doi.org/','')}"}.join("\n")}${workflow.manifest.doi ? 
"\n" : ""} +* The nf-core framework + https://doi.org/10.1038/s41587-020-0439-x + +* Software dependencies + https://github.com/nf-core/stableexpression/blob/main/CITATIONS.md +""" + command = "nextflow run ${workflow.manifest.name} -profile --species --outdir " + + UTILS_NFSCHEMA_PLUGIN ( + workflow, validate_params, - "nextflow_schema.json" + null, + help, + help_full, + show_hidden, + before_text, + after_text, + command ) // @@ -72,12 +89,30 @@ workflow PIPELINE_INITIALISATION { nextflow_cli_args ) + // + // Custom validation for pipeline parameters + // + validateInputParameters( params ) + + // + // Create channel from datasets file provided through params.datasets + // + if (params.datasets) { + ch_input_datasets = parseInputDatasets( params.datasets ) + validateInputSamplesheet( ch_input_datasets ) + } else { + ch_input_datasets = channel.empty() + } + + emit: + input_datasets = ch_input_datasets + } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW FOR PIPELINE COMPLETION -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow PIPELINE_COMPLETION { @@ -89,21 +124,31 @@ workflow PIPELINE_COMPLETION { outdir // path: Path to output directory where results will be published monochrome_logs // boolean: Disable ANSI colour codes in log output hook_url // string: hook URL for notifications + multiqc_report // string: Path to MultiQC report main: - summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") + // summary_params is still required by completionEmail() and imNotification() in the onComplete handler below + def summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") + def multiqc_reports = multiqc_report.toList() // // Completion email and summary // workflow.onComplete { if (email || email_on_fail) { - completionEmail(summary_params, email, email_on_fail, plaintext_email, outdir, monochrome_logs) + completionEmail( + 
summary_params, + email, + email_on_fail, + plaintext_email, + outdir, + monochrome_logs, + multiqc_reports.getVal(), + ) } completionSummary(monochrome_logs) - if (hook_url) { imNotification(summary_params, hook_url) } @@ -114,43 +157,279 @@ workflow PIPELINE_COMPLETION { } } +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +// +// Check and validate pipeline parameters +// + + +// Raises an error unless the accession starts with "E-" or "GSE" +def check_accession(accession) { + if ( !( accession.startsWith('E-') || accession.startsWith('GSE') ) ) { + error('Accession ' + accession + ' is not well formated. All accessions should start with "E-" or "GSE".') + } +} + + +// Checks every accession of a comma-separated string (null and empty strings are accepted) +def check_accession_string(accessions_str) { + if ( accessions_str != null && accessions_str != "" ) { + accessions_str.tokenize(',').each { accession -> + check_accession(accession) + } + } +} + +// Checks every non-blank line of an accession file, one accession per line (null is accepted) +def check_accession_file(accession_file) { + if ( accession_file != null ) { + // blank lines are not accessions: skip them instead of reporting them as malformed + def lines = new File(accession_file).readLines().findAll { line -> line.trim() } + lines.each { accession -> + check_accession(accession) + } + } +} + +def validateInputParameters(params) { + + // checking that a species has been provided + if ( !params.species ) { + error('You must provide a species name') + } + + // if accessions are provided or excluded, checking that they are well formatted + check_accession_string( params.accessions ) + check_accession_string( params.excluded_accessions ) + + check_accession_file( params.accessions_file ) + check_accession_file( params.excluded_accessions_file ) + + if ( params.keywords && params.skip_fetch_eatlas_accessions && !params.fetch_geo_accessions ) { + log.warn "Ignoring keywords as accessions will not be fetched from Expression Atlas or GEO" + } + +} + +// +// Parses the input datasets samplesheet into a single channel of [meta, count_file] +// where the base name of the count file is added to meta under the key `dataset` +def parseInputDatasets(samplesheet) { + return 
return channel.fromList( samplesheetToList(samplesheet, "assets/schema_datasets.json") ) + .map { + item -> + def (meta, count_file) = item + def new_meta = meta + [dataset: count_file.getBaseName()] + [new_meta, count_file] + } +} + + +// +// Validate channels from input samplesheet +// +def validateInputSamplesheet( ch_datasets ) { + // checking that all microarray datasets (if any) are normalised + ch_datasets + .filter { + meta, file -> + meta.platform == 'microarray' && !meta.normalised + } + .count() + .map { count -> + if (count > 0) { + def error_text = [ + "Error: You provided at least one microarray dataset that is not normalised. ", + "Microarray datasets must already be normalised before being submitted. ", + "Please perform normalisation (typically using RMA for one-colour intensities / LOESS (limma) for two-colour intensities) and run again." + ].join(' ').trim() + error(error_text) + } + } + + // checking that all count files are well formated (same number of columns in header and rows) + ch_datasets + .map { meta, file -> + def header = file.withReader { reader -> reader.readLine() } + def separator = header.contains(',') ? "," : + header.contains('\t') ? "\t" : + " " + def first_row = file.splitCsv( header: false, skip: 1, limit: 1, sep: separator ) + + assert header.split(separator).size() == first_row[0].size() : "Header and first row do not have the same number of columns in file ${file}" + } +} +// +// Generate methods description for MultiQC +// +def toolCitationText() { + // TODO nf-core: Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "MultiQC (Ewels et al. 2016)", + "." 
+ ].join(' ').trim() + + return citation_text +} + +def toolBibliographyText() { + // TODO nf-core: Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
  • " + ].join(' ').trim() + + return reference_text +} + +def methodsDescriptionText(mqc_methods_yaml) { + // Convert to a named map so can be used as with familiar NXF ${workflow} variable syntax in the MultiQC YML file + def meta = [:] + meta.workflow = workflow.toMap() + meta["manifest_map"] = workflow.manifest.toMap() + + // Pipeline DOI + if (meta.manifest_map.doi) { + // Using a loop to handle multiple DOIs + // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers + // Removing ` ` since the manifest.doi is a string and not a proper list + def temp_doi_ref = "" + def manifest_doi = meta.manifest_map.doi.tokenize(",") + manifest_doi.each { doi_ref -> + temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), " + } + meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2) + } else meta["doi_text"] = "" + meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " + + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + // meta["tool_bibliography"] = toolBibliographyText() + + + def methods_text = mqc_methods_yaml.text + + def engine = new groovy.text.SimpleTemplateEngine() + def description_html = engine.createTemplate(methods_text).make(meta) + + return description_html.toString() +} + /* ======================================================================================== - FUNCTIONS + FUNCTIONS FOR FORMATTING DATA FETCHED FROM EXPRESSION ATLAS / GEO ======================================================================================== */ -// Setting these functions as temporary replacements of the native softwareVersionsToYAML / processVersionsFromYAML +// +// Get Expression Atlas Batch ID (accession + data_type) from file stem +// +def addDatasetIdToMetadata( ch_files ) { + return ch_files + .map { + file -> + def meta = [ dataset: file.getSimpleName() ] + [meta, file] + } +} // -// Get software versions for pipeline +// Groups design and data files by accession and data_type +// Design and count files have necessarily the same dataset ID (same file stem) // -def customProcessVersionsFromYAML(yaml_file) { - Yaml yaml = new Yaml() - versions = yaml.load(yaml_file) - return yaml.dumpAsMap(versions).trim() +def groupFilesByDatasetId(ch_design, ch_counts) { + return ch_design + .concat( ch_counts ) // puts counts at the end of the resulting channel + .groupTuple() // groups by dataset ID; design files are necessarily BEFORE count files + .filter { + it.get(1).size() == 2 // only groups with two files + } + .filter { // only groups with first file as design file and second one as count file + meta, 
files -> + files.get(0).name.endsWith('.design.csv') && !files.get(1).name.endsWith('.design.csv') + } + .map { // putting design file in meta + meta, files -> + def new_meta = meta + [design: files[0]] + [new_meta, files[1]] + } } -// def dumpVersionsYAML() +def getNthPartFromEnd(String s, int n) { + def tokens = s.tokenize('.') + return tokens[tokens.size() - n] +} // -// Get channel of software versions used in pipeline in YAML format +// Add normalised: true / false in meta // -def customSoftwareVersionsToYAML(versions) { - return Channel.of(workflowVersionToYAML()) - .concat( - versions - .unique() - .map { - name, tool, version -> [ name.tokenize(':')[-1], [ tool, version ] ] - } - .groupTuple() - .map { - processName, toolInfo -> - def toolVersions = toolInfo.collect { tool, version -> " ${tool}: ${version}" }.join('\n') - "${processName}:\n${toolVersions}\n" - } - .map { customProcessVersionsFromYAML(it) } - ) +def augmentMetadata( ch_files ) { + return ch_files + .map { + meta, file -> + def norm_state = getNthPartFromEnd(file.name, 3) + def normalised = false + if ( norm_state == 'normalised' ) { + normalised = true + } else if ( norm_state == 'raw' ) { + normalised = false + } else { + error("Invalid normalisation state: ${norm_state}") + } + + def platform = getNthPartFromEnd(file.name, 4) + def new_meta = meta + [normalised: normalised, platform: platform] + [new_meta, file] + } +} + + +/* +======================================================================================== + FUNCTIONS FOR CHECKING NB OF DATASETS +======================================================================================== +*/ + +def checkCounts(ch_counts, fetch_geo_accessions) { + + ch_counts.count().map { n -> + if( n == 0 ) { + // display a warning if no datasets are found + def msg_lst = [] + if ( !fetch_geo_accessions ) { + msg_lst = [ + "Could not find any readily usable public dataset...", + "This might be due to connection issues on the Expression Atlas FTP 
server.", + "If it is the case, please wait for a couple of minutes and run again.", + "Alternatively, datasets for your species of interest might not exist on Expression Atlas.", + "In this case, you can try to get additional datasets from NCBI GEO Datasets using the --fetch_geo_accessions flag (this feature is still experimental)." + ] + } else { + msg_lst = [ + "Could not find any readily usable public dataset...", + "This might be due to connection issues on the Expression Atlas FTP server.", + "If it is the case, please wait for a couple of minutes and run again.", + "You can check directly on NCBI GEO Datasets if there are available datasets for this species that you can prepare yourself:", + "https://www.ncbi.nlm.nih.gov/gds", + "Once you have prepared your own data, you can relaunch the pipeline and provide your prepared count datasets using the --datasets parameter. ", + "For more information, see the online documentation at https://nf-co.re/stableexpression." + ] + } + def msg = msg_lst.join("\n").trim() + error(msg) + } + } } diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf index ac31f28f..d6e593e8 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf @@ -2,18 +2,13 @@ // Subworkflow with functionality that may be useful for any Nextflow pipeline // -import org.yaml.snakeyaml.Yaml -import groovy.json.JsonOutput -import nextflow.extension.FilesEx - /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW DEFINITION -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow UTILS_NEXTFLOW_PIPELINE { - take: print_version // boolean: print version 
dump_parameters // boolean: dump parameters @@ -26,7 +21,7 @@ workflow UTILS_NEXTFLOW_PIPELINE { // Print workflow version and exit on --version // if (print_version) { - log.info "${workflow.manifest.name} ${getWorkflowVersion()}" + log.info("${workflow.manifest.name} ${getWorkflowVersion()}") System.exit(0) } @@ -49,16 +44,16 @@ workflow UTILS_NEXTFLOW_PIPELINE { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // // Generate version string // def getWorkflowVersion() { - String version_string = "" + def version_string = "" as String if (workflow.manifest.version) { def prefix_v = workflow.manifest.version[0] != 'v' ? 'v' : '' version_string += "${prefix_v}${workflow.manifest.version}" @@ -76,13 +71,13 @@ def getWorkflowVersion() { // Dump pipeline parameters to a JSON file // def dumpParametersToJSON(outdir) { - def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') - def filename = "params_${timestamp}.json" - def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") - def jsonStr = JsonOutput.toJson(params) - temp_pf.text = JsonOutput.prettyPrint(jsonStr) + def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = groovy.json.JsonOutput.toJson(params) + temp_pf.text = groovy.json.JsonOutput.prettyPrint(jsonStr) - FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") + nextflow.extension.FilesEx.copyTo(temp_pf.toPath(), "${outdir}/pipeline_info/params_${timestamp}.json") temp_pf.delete() } @@ -90,37 +85,42 @@ def dumpParametersToJSON(outdir) { 
// When running with -profile conda, warn if channels have not been set-up appropriately // def checkCondaChannels() { - Yaml parser = new Yaml() + def parser = new org.yaml.snakeyaml.Yaml() def channels = [] try { def config = parser.load("conda config --show channels".execute().text) channels = config.channels - } catch(NullPointerException | IOException e) { - log.warn "Could not verify conda channel configuration." - return + } + catch (NullPointerException e) { + log.debug(e) + log.warn("Could not verify conda channel configuration.") + return null + } + catch (IOException e) { + log.debug(e) + log.warn("Could not verify conda channel configuration.") + return null } // Check that all channels are present // This channel list is ordered by required channel priority. - def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def required_channels_in_order = ['conda-forge', 'bioconda'] def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - def channel_priority_violation = false - def n = required_channels_in_order.size() - for (int i = 0; i < n - 1; i++) { - channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) - } + def channel_priority_violation = required_channels_in_order != channels.findAll { ch -> ch in required_channels_in_order } if (channels_missing | channel_priority_violation) { - log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " There is a problem with your Conda configuration!\n\n" + - " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/\n" + - " The observed channel order is \n" + - " ${channels}\n" + - " but the following channel order is required:\n" + - " ${required_channels_in_order}\n" + - 
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + log.warn """\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + There is a problem with your Conda configuration! + You will need to set-up the conda-forge and bioconda channels correctly. + Please refer to https://bioconda.github.io/ + The observed channel order is + ${channels} + but the following channel order is required: + ${required_channels_in_order} + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + """.stripIndent(true) } } diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test index ca964ce8..02dbf094 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test +++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test @@ -52,10 +52,12 @@ nextflow_workflow { } then { - assertAll( - { assert workflow.success }, - { assert workflow.stdout.contains("nextflow_workflow v9.9.9") } - ) + expect { + with(workflow) { + assert success + assert "nextflow_workflow v9.9.9" in stdout + } + } } } diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config b/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config index d0a926bf..a09572e5 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config @@ -3,7 +3,7 @@ manifest { author = """nf-core""" homePage = 'https://127.0.0.1' description = """Dummy pipeline""" - nextflowVersion = '!>=23.04.0' + nextflowVersion = '!>=23.04.0' version = '9.9.9' doi = 'https://doi.org/10.5281/zenodo.5070524' } diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml deleted file mode 100644 index f8476112..00000000 --- 
a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nextflow_pipeline: - - subworkflows/nf-core/utils_nextflow_pipeline/** diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index b290cd7b..bfd25876 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -2,17 +2,13 @@ // Subworkflow with utility functions specific to the nf-core pipeline template // -import org.yaml.snakeyaml.Yaml -import nextflow.extension.FilesEx - /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW DEFINITION -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow UTILS_NFCORE_PIPELINE { - take: nextflow_cli_args @@ -25,23 +21,20 @@ workflow UTILS_NFCORE_PIPELINE { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // // Warn if a -profile or Nextflow config has not been provided to run the pipeline // def checkConfigProvided() { - valid_config = true + def valid_config = true as Boolean if (workflow.profile == 'standard' && workflow.configFiles.size() <= 1) { - log.warn "[$workflow.manifest.name] You are attempting to run the pipeline without any custom configuration!\n\n" + - "This will be dependent on your local compute environment but can be achieved via one or more of the following:\n" + - " (1) Using an 
existing pipeline profile e.g. `-profile docker` or `-profile singularity`\n" + - " (2) Using an existing nf-core/configs for your Institution e.g. `-profile crick` or `-profile uppmax`\n" + - " (3) Using your own local custom config e.g. `-c /path/to/your/custom.config`\n\n" + - "Please refer to the quick start section and usage docs for the pipeline.\n " + log.warn( + "[${workflow.manifest.name}] You are attempting to run the pipeline without any custom configuration!\n\n" + "This will be dependent on your local compute environment but can be achieved via one or more of the following:\n" + " (1) Using an existing pipeline profile e.g. `-profile docker` or `-profile singularity`\n" + " (2) Using an existing nf-core/configs for your Institution e.g. `-profile crick` or `-profile uppmax`\n" + " (3) Using your own local custom config e.g. `-c /path/to/your/custom.config`\n\n" + "Please refer to the quick start section and usage docs for the pipeline.\n " + ) valid_config = false } return valid_config @@ -52,39 +45,22 @@ def checkConfigProvided() { // def checkProfileProvided(nextflow_cli_args) { if (workflow.profile.endsWith(',')) { - error "The `-profile` option cannot end with a trailing comma, please remove it and re-run the pipeline!\n" + - "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + error( + "The `-profile` option cannot end with a trailing comma, please remove it and re-run the pipeline!\n" + "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + ) } if (nextflow_cli_args[0]) { - log.warn "nf-core pipelines do not accept positional arguments. The positional argument `${nextflow_cli_args[0]}` has been detected.\n" + - "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + log.warn( + "nf-core pipelines do not accept positional arguments. 
The positional argument `${nextflow_cli_args[0]}` has been detected.\n" + "HINT: A common mistake is to provide multiple values separated by spaces e.g. `-profile test, docker`.\n" + ) } } -// -// Citation string for pipeline -// -def workflowCitation() { - def temp_doi_ref = "" - String[] manifest_doi = workflow.manifest.doi.tokenize(",") - // Using a loop to handle multiple DOIs - // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers - // Removing ` ` since the manifest.doi is a string and not a proper list - for (String doi_ref: manifest_doi) temp_doi_ref += " https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n" - return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - "* The pipeline\n" + - temp_doi_ref + "\n" + - "* The nf-core framework\n" + - " https://doi.org/10.1038/s41587-020-0439-x\n\n" + - "* Software dependencies\n" + - " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" -} - // // Generate workflow version string // def getWorkflowVersion() { - String version_string = "" + def version_string = "" as String if (workflow.manifest.version) { def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' version_string += "${prefix_v}${workflow.manifest.version}" @@ -102,8 +78,8 @@ def getWorkflowVersion() { // Get software versions for pipeline // def processVersionsFromYAML(yaml_file) { - Yaml yaml = new Yaml() - versions = yaml.load(yaml_file).collectEntries { k, v -> [ k.tokenize(':')[-1], v ] } + def yaml = new org.yaml.snakeyaml.Yaml() + def versions = yaml.load(yaml_file).collectEntries { k, v -> [k.tokenize(':')[-1], v] } return yaml.dumpAsMap(versions).trim() } @@ -113,20 +89,16 @@ def processVersionsFromYAML(yaml_file) { def workflowVersionToYAML() { return """ Workflow: - $workflow.manifest.name: ${getWorkflowVersion()} - Nextflow: $workflow.nextflow.version + ${workflow.manifest.name}: ${getWorkflowVersion()} + Nextflow: ${workflow.nextflow.version} """.stripIndent().trim() } // // Get channel of software versions used in pipeline in YAML format // -def softwareVersionsToYAML(versions) { - return versions - .unique() - .map { processVersionsFromYAML(it) } - .unique() - .mix(Channel.of(workflowVersionToYAML())) +def softwareVersionsToYAML(ch_versions) { + return ch_versions.unique().map { version -> processVersionsFromYAML(version) }.unique().mix(Channel.of(workflowVersionToYAML())) } // @@ -134,61 +106,40 @@ def softwareVersionsToYAML(versions) { // def paramsSummaryMultiqc(summary_params) { def summary_section = '' - for (group in summary_params.keySet()) { - def group_params = summary_params.get(group) // This gets the parameters of that particular group - if (group_params) { - summary_section += "

    $group

    \n" - summary_section += "
    \n" - for (param in group_params.keySet()) { - summary_section += "
    $param
    ${group_params.get(param) ?: 'N/A'}
    \n" + summary_params + .keySet() + .each { group -> + def group_params = summary_params.get(group) + // This gets the parameters of that particular group + if (group_params) { + summary_section += "

    ${group}

    \n" + summary_section += "
    \n" + group_params + .keySet() + .sort() + .each { param -> + summary_section += "
    ${param}
    ${group_params.get(param) ?: 'N/A'}
    \n" + } + summary_section += "
    \n" } - summary_section += "
    \n" } - } - String yaml_file_text = "id: '${workflow.manifest.name.replace('/','-')}-summary'\n" - yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" - yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" - yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" - yaml_file_text += "plot_type: 'html'\n" - yaml_file_text += "data: |\n" - yaml_file_text += "${summary_section}" + def yaml_file_text = "id: '${workflow.manifest.name.replace('/', '-')}-summary'\n" as String + yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" + yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" + yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" + yaml_file_text += "plot_type: 'html'\n" + yaml_file_text += "data: |\n" + yaml_file_text += "${summary_section}" return yaml_file_text } -// -// nf-core logo -// -def nfCoreLogo(monochrome_logs=true) { - Map colors = logColours(monochrome_logs) - String.format( - """\n - ${dashedLine(monochrome_logs)} - ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} - ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} - ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} - ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} - ${colors.green}`._,._,\'${colors.reset} - ${colors.purple} ${workflow.manifest.name} ${getWorkflowVersion()}${colors.reset} - ${dashedLine(monochrome_logs)} - """.stripIndent() - ) -} - -// -// Return dashed line -// -def dashedLine(monochrome_logs=true) { - Map colors = logColours(monochrome_logs) - return "-${colors.dim}----------------------------------------------------${colors.reset}-" -} - // // ANSII colours used for terminal logging // def logColours(monochrome_logs=true) { - Map colorcodes = [:] + def 
colorcodes = [:] as Map // Reset / Meta colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" @@ -200,79 +151,76 @@ def logColours(monochrome_logs=true) { colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" // Regular Colors - colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" - colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" - colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" - colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" - colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" - colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" - colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" - colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" // Bold - colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" - colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" - colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" - colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" - colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" - colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" - colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" - colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" + colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" + colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" + colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" + colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" + colorcodes['bblue'] = monochrome_logs ? 
'' : "\033[1;34m" + colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" + colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" + colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" // Underline - colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" - colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" - colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" - colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" - colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" - colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" - colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" - colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" + colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" + colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" + colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" + colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" + colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" + colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" + colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" + colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" // High Intensity - colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" - colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" - colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" - colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" - colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" - colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" - colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" - colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" + colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" + colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" + colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" + colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" + colorcodes['iblue'] = monochrome_logs ? 
'' : "\033[0;94m" + colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" + colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" + colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" // Bold High Intensity - colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" - colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" - colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" - colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" - colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" - colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" - colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" - colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" + colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" + colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" + colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" + colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" + colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" + colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" + colorcodes['biwhite'] = monochrome_logs ? 
'' : "\033[1;97m" return colorcodes } -// -// Attach the multiqc report to email -// -def attachMultiqcReport(multiqc_report) { - def mqc_report = null - try { - if (workflow.success) { - mqc_report = multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { - if (mqc_report.size() > 1) { - log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" - } - mqc_report = mqc_report[0] - } - } - } catch (all) { - if (multiqc_report) { - log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" +// Return a single report from an object that may be a Path or List +// +def getSingleReport(multiqc_reports) { + if (multiqc_reports instanceof Path) { + return multiqc_reports + } else if (multiqc_reports instanceof List) { + if (multiqc_reports.size() == 0) { + log.warn("[${workflow.manifest.name}] No reports found from process 'MULTIQC'") + return null + } else if (multiqc_reports.size() == 1) { + return multiqc_reports.first() + } else { + log.warn("[${workflow.manifest.name}] Found multiple reports from process 'MULTIQC', will use only one") + return multiqc_reports.first() } + } else { + return null } - return mqc_report } // @@ -281,26 +229,35 @@ def attachMultiqcReport(multiqc_report) { def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdir, monochrome_logs=true, multiqc_report=null) { // Set up the e-mail variables - def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + def subject = "[${workflow.manifest.name}] Successful: ${workflow.runName}" if (!workflow.success) { - subject = "[$workflow.manifest.name] FAILED: $workflow.runName" + subject = "[${workflow.manifest.name}] FAILED: ${workflow.runName}" } def summary = [:] - for (group in summary_params.keySet()) { - summary << summary_params[group] - } + summary_params + .keySet() + .sort() + .each { group -> + summary << summary_params[group] + } def misc_fields = [:] 
misc_fields['Date Started'] = workflow.start misc_fields['Date Completed'] = workflow.complete misc_fields['Pipeline script file path'] = workflow.scriptFile misc_fields['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision - misc_fields['Nextflow Version'] = workflow.nextflow.version - misc_fields['Nextflow Build'] = workflow.nextflow.build + if (workflow.repository) { + misc_fields['Pipeline repository Git URL'] = workflow.repository + } + if (workflow.commitId) { + misc_fields['Pipeline repository Git Commit'] = workflow.commitId + } + if (workflow.revision) { + misc_fields['Pipeline Git branch/tag'] = workflow.revision + } + misc_fields['Nextflow Version'] = workflow.nextflow.version + misc_fields['Nextflow Build'] = workflow.nextflow.build misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp def email_fields = [:] @@ -317,7 +274,7 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi email_fields['summary'] = summary << misc_fields // On success try attach the multiqc report - def mqc_report = attachMultiqcReport(multiqc_report) + def mqc_report = getSingleReport(multiqc_report) // Check if we are only sending emails on failure def email_address = email @@ -337,40 +294,45 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "${workflow.projectDir}", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? params.max_multiqc_email_size : 0) as MemoryUnit + def smail_fields = [email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "${workflow.projectDir}", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes()] def sf = new File("${workflow.projectDir}/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() // Send the HTML e-mail - Map colors = logColours(monochrome_logs) + def colors = logColours(monochrome_logs) as Map if (email_address) { try { - if (plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + if (plaintext_email) { + new org.codehaus.groovy.GroovyException('Send plaintext e-mail, not HTML') + } // Try to send HTML e-mail using sendmail def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") sendmail_tf.withWriter { w -> w << sendmail_html } - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" - } catch (all) { + ['sendmail', '-t'].execute() << sendmail_html + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.green} Sent summary e-mail to ${email_address} (sendmail)-") + } + catch (Exception msg) { + log.debug(msg.toString()) + log.debug("Trying with mail instead of sendmail") // Catch failures and try with plaintext - def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + def mail_cmd = ['mail', '-s', subject, '--content-type=text/html', 
email_address] mail_cmd.execute() << email_html - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.green} Sent summary e-mail to ${email_address} (mail)-") } } // Write summary e-mail HTML to a file def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - FilesEx.copyTo(output_hf.toPath(), "${outdir}/pipeline_info/pipeline_report.html"); + nextflow.extension.FilesEx.copyTo(output_hf.toPath(), "${outdir}/pipeline_info/pipeline_report.html") output_hf.delete() // Write summary e-mail TXT to a file def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } - FilesEx.copyTo(output_tf.toPath(), "${outdir}/pipeline_info/pipeline_report.txt"); + nextflow.extension.FilesEx.copyTo(output_tf.toPath(), "${outdir}/pipeline_info/pipeline_report.txt") output_tf.delete() } @@ -378,15 +340,17 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi // Print pipeline summary on completion // def completionSummary(monochrome_logs=true) { - Map colors = logColours(monochrome_logs) + def colors = logColours(monochrome_logs) as Map if (workflow.success) { if (workflow.stats.ignoredCount == 0) { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" - } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.green} Pipeline completed successfully${colors.reset}-") } - } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" + else { + 
log.info("-${colors.purple}[${workflow.manifest.name}]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-") + } + } + else { + log.info("-${colors.purple}[${workflow.manifest.name}]${colors.red} Pipeline completed with errors${colors.reset}-") } } @@ -395,21 +359,30 @@ def completionSummary(monochrome_logs=true) { // def imNotification(summary_params, hook_url) { def summary = [:] - for (group in summary_params.keySet()) { - summary << summary_params[group] - } + summary_params + .keySet() + .sort() + .each { group -> + summary << summary_params[group] + } def misc_fields = [:] - misc_fields['start'] = workflow.start - misc_fields['complete'] = workflow.complete - misc_fields['scriptfile'] = workflow.scriptFile - misc_fields['scriptid'] = workflow.scriptId - if (workflow.repository) misc_fields['repository'] = workflow.repository - if (workflow.commitId) misc_fields['commitid'] = workflow.commitId - if (workflow.revision) misc_fields['revision'] = workflow.revision - misc_fields['nxf_version'] = workflow.nextflow.version - misc_fields['nxf_build'] = workflow.nextflow.build - misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) { + misc_fields['repository'] = workflow.repository + } + if (workflow.commitId) { + misc_fields['commitid'] = workflow.commitId + } + if (workflow.revision) { + misc_fields['revision'] = workflow.revision + } + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp def msg_fields = [:] msg_fields['version'] = getWorkflowVersion() @@ -434,13 +407,13 @@ def imNotification(summary_params, hook_url) { def json_message = json_template.toString() // POST - def post = new 
URL(hook_url).openConnection(); + def post = new URL(hook_url).openConnection() post.setRequestMethod("POST") post.setDoOutput(true) post.setRequestProperty("Content-Type", "application/json") - post.getOutputStream().write(json_message.getBytes("UTF-8")); - def postRC = post.getResponseCode(); - if (! postRC.equals(200)) { - log.warn(post.getErrorStream().getText()); + post.getOutputStream().write(json_message.getBytes("UTF-8")) + def postRC = post.getResponseCode() + if (!postRC.equals(200)) { + log.warn(post.getErrorStream().getText()) } } diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test index 1dc317f8..f117040c 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test +++ b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test @@ -41,26 +41,14 @@ nextflow_function { } } - test("Test Function workflowCitation") { - - function "workflowCitation" - - then { - assertAll( - { assert function.success }, - { assert snapshot(function.result).match() } - ) - } - } - - test("Test Function nfCoreLogo") { + test("Test Function without logColours") { - function "nfCoreLogo" + function "logColours" when { function { """ - input[0] = false + input[0] = true """ } } @@ -73,9 +61,8 @@ nextflow_function { } } - test("Test Function dashedLine") { - - function "dashedLine" + test("Test Function with logColours") { + function "logColours" when { function { @@ -93,14 +80,13 @@ nextflow_function { } } - test("Test Function without logColours") { - - function "logColours" + test("Test Function getSingleReport with a single file") { + function "getSingleReport" when { function { """ - input[0] = true + input[0] = file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true) """ } } @@ -108,18 +94,22 @@ nextflow_function { then { assertAll( { assert function.success }, - { assert snapshot(function.result).match() 
} + { assert function.result.contains("test.tsv") } ) } } - test("Test Function with logColours") { - function "logColours" + test("Test Function getSingleReport with multiple files") { + function "getSingleReport" when { function { """ - input[0] = false + input[0] = [ + file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true), + file(params.modules_testdata_base_path + '/generic/tsv/network.tsv', checkIfExists: true), + file(params.modules_testdata_base_path + '/generic/tsv/expression.tsv', checkIfExists: true) + ] """ } } @@ -127,7 +117,9 @@ nextflow_function { then { assertAll( { assert function.success }, - { assert snapshot(function.result).match() } + { assert function.result.contains("test.tsv") }, + { assert !function.result.contains("network.tsv") }, + { assert !function.result.contains("expression.tsv") } ) } } diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap index 1037232c..02c67014 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap +++ b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap @@ -17,26 +17,6 @@ }, "timestamp": "2024-02-28T12:02:59.729647" }, - "Test Function nfCoreLogo": { - "content": [ - "\n\n-\u001b[2m----------------------------------------------------\u001b[0m-\n \u001b[0;32m,--.\u001b[0;30m/\u001b[0;32m,-.\u001b[0m\n\u001b[0;34m ___ __ __ __ ___ \u001b[0;32m/,-._.--~'\u001b[0m\n\u001b[0;34m |\\ | |__ __ / ` / \\ |__) |__ \u001b[0;33m} {\u001b[0m\n\u001b[0;34m | \\| | \\__, \\__/ | \\ |___ \u001b[0;32m\\`-._,-`-,\u001b[0m\n \u001b[0;32m`._,._,'\u001b[0m\n\u001b[0;35m nextflow_workflow v9.9.9\u001b[0m\n-\u001b[2m----------------------------------------------------\u001b[0m-\n" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-28T12:03:10.562934" - }, - "Test Function workflowCitation": { 
- "content": [ - "If you use nextflow_workflow for your analysis please cite:\n\n* The pipeline\n https://doi.org/10.5281/zenodo.5070524\n\n* The nf-core framework\n https://doi.org/10.1038/s41587-020-0439-x\n\n* Software dependencies\n https://github.com/nextflow_workflow/blob/master/CITATIONS.md" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-28T12:03:07.019761" - }, "Test Function without logColours": { "content": [ { @@ -95,16 +75,6 @@ }, "timestamp": "2024-02-28T12:03:17.969323" }, - "Test Function dashedLine": { - "content": [ - "-\u001b[2m----------------------------------------------------\u001b[0m-" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-28T12:03:14.366181" - }, "Test Function with logColours": { "content": [ { diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml deleted file mode 100644 index ac8523c9..00000000 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfcore_pipeline: - - subworkflows/nf-core/utils_nfcore_pipeline/** diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf new file mode 100644 index 00000000..acb39724 --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -0,0 +1,73 @@ +// +// Subworkflow that uses the nf-schema plugin to validate parameters and render the parameter summary +// + +include { paramsSummaryLog } from 'plugin/nf-schema' +include { validateParameters } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' + +workflow UTILS_NFSCHEMA_PLUGIN { + + take: + input_workflow // workflow: the workflow object used by nf-schema to get metadata from the workflow + validate_params // boolean: validate the parameters + parameters_schema // string: path to the parameters JSON schema. 
+ // this has to be the same as the schema given to `validation.parametersSchema` + // when this input is empty it will automatically use the configured schema or + // "${projectDir}/nextflow_schema.json" as default. This input should not be empty + // for meta pipelines + help // boolean: show help message + help_full // boolean: show full help message + show_hidden // boolean: show hidden parameters in help message + before_text // string: text to show before the help message and parameters summary + after_text // string: text to show after the help message and parameters summary + command // string: an example command of the pipeline + + main: + + if(help || help_full) { + help_options = [ + beforeText: before_text, + afterText: after_text, + command: command, + showHidden: show_hidden, + fullHelp: help_full, + ] + if(parameters_schema) { + help_options << [parametersSchema: parameters_schema] + } + log.info paramsHelp( + help_options, + params.help instanceof String ? params.help : "", + ) + exit 0 + } + + // + // Print parameter summary to stdout. 
This will display the parameters + // that differ from the default given in the JSON schema + // + + summary_options = [:] + if(parameters_schema) { + summary_options << [parametersSchema: parameters_schema] + } + log.info before_text + log.info paramsSummaryLog(summary_options, input_workflow) + log.info after_text + + // + // Validate the parameters using nextflow_schema.json or the schema + // given via the validation.parametersSchema configuration option + // + if(validate_params) { + validateOptions = [:] + if(parameters_schema) { + validateOptions << [parametersSchema: parameters_schema] + } + validateParameters(validateOptions) + } + + emit: + dummy_emit = true +} diff --git a/subworkflows/nf-core/utils_nfschema_plugin/meta.yml b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml new file mode 100644 index 00000000..f7d9f028 --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/meta.yml @@ -0,0 +1,35 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "utils_nfschema_plugin" +description: Run nf-schema to validate parameters and create a summary of changed parameters +keywords: + - validation + - JSON schema + - plugin + - parameters + - summary +components: [] +input: + - input_workflow: + type: object + description: | + The workflow object of the used pipeline. + This object contains meta data used to create the params summary log + - validate_params: + type: boolean + description: Validate the parameters and error if invalid. + - parameters_schema: + type: string + description: | + Path to the parameters JSON schema. + This has to be the same as the schema given to the `validation.parametersSchema` config + option. When this input is empty it will automatically use the configured schema or + "${projectDir}/nextflow_schema.json" as default. The schema should not be given in this way + for meta pipelines. 
+output: + - dummy_emit: + type: boolean + description: Dummy emit to make nf-core subworkflows lint happy +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test new file mode 100644 index 00000000..c977917a --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -0,0 +1,173 @@ +nextflow_workflow { + + name "Test Subworkflow UTILS_NFSCHEMA_PLUGIN" + script "../main.nf" + workflow "UTILS_NFSCHEMA_PLUGIN" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/utils_nfschema_plugin" + tag "plugin/nf-schema" + + config "./nextflow.config" + + test("Should run nothing") { + + when { + + params { + test_data = '' + } + + workflow { + """ + validate_params = false + input[0] = workflow + input[1] = validate_params + input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + + test("Should validate params") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" + """ + } + } + + then { + assertAll( + { assert workflow.failed }, + { assert workflow.stdout.any { it.contains('ERROR ~ Validation of pipeline parameters failed!') } } + ) + } + } + + test("Should run nothing - custom schema") { + + when { + + params { + test_data = '' + } + + workflow { + """ + validate_params = false + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" + 
""" + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + + test("Should validate params - custom schema") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" + """ + } + } + + then { + assertAll( + { assert workflow.failed }, + { assert workflow.stdout.any { it.contains('ERROR ~ Validation of pipeline parameters failed!') } } + ) + } + } + + test("Should create a help message") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = true + input[4] = false + input[5] = false + input[6] = "Before" + input[7] = "After" + input[8] = "nextflow run test/test" + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } +} diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config new file mode 100644 index 00000000..8d8c7371 --- /dev/null +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -0,0 +1,8 @@ +plugins { + id "nf-schema@2.5.1" +} + +validation { + parametersSchema = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + monochromeLogs = true +} diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json similarity index 95% rename from subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json rename to 
subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json index 7626c1c9..331e0d2f 100644 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json @@ -1,10 +1,10 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": ". pipeline parameters", "description": "", "type": "object", - "definitions": { + "$defs": { "input_output_options": { "title": "Input/output options", "type": "object", @@ -87,10 +87,10 @@ }, "allOf": [ { - "$ref": "#/definitions/input_output_options" + "$ref": "#/$defs/input_output_options" }, { - "$ref": "#/definitions/generic_options" + "$ref": "#/$defs/generic_options" } ] } diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf b/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf deleted file mode 100644 index 50155c08..00000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf +++ /dev/null @@ -1,62 +0,0 @@ -// -// Subworkflow that uses the nf-validation plugin to render help text and parameter summary -// - -/* -======================================================================================== - IMPORT NF-VALIDATION PLUGIN -======================================================================================== -*/ - -include { paramsHelp } from 'plugin/nf-schema' -include { paramsSummaryLog } from 'plugin/nf-schema' -include { validateParameters } from 'plugin/nf-schema' - -/* -======================================================================================== - SUBWORKFLOW DEFINITION -======================================================================================== -*/ - -workflow UTILS_NFVALIDATION_PLUGIN { - - take: - print_help // boolean: print help - workflow_command // string: default commmand used to run pipeline - pre_help_text // 
string: string to be printed before help text and summary log - post_help_text // string: string to be printed after help text and summary log - validate_params // boolean: validate parameters - schema_filename // path: JSON schema file, null to use default value - - main: - - log.debug "Using schema file: ${schema_filename}" - - // Default values for strings - pre_help_text = pre_help_text ?: '' - post_help_text = post_help_text ?: '' - workflow_command = workflow_command ?: '' - - // - // Print help message if needed - // - if (print_help) { - log.info pre_help_text + paramsHelp(workflow_command, parameters_schema: schema_filename) + post_help_text - System.exit(0) - } - - // - // Print parameter summary to stdout - // - log.info pre_help_text + paramsSummaryLog(workflow, parameters_schema: schema_filename) + post_help_text - - // - // Validate parameters relative to the parameter JSON schema - // - if (validate_params){ - validateParameters(parameters_schema: schema_filename) - } - - emit: - dummy_emit = true -} diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/meta.yml b/subworkflows/nf-core/utils_nfvalidation_plugin/meta.yml deleted file mode 100644 index 3d4a6b04..00000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/meta.yml +++ /dev/null @@ -1,44 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json -name: "UTILS_NFVALIDATION_PLUGIN" -description: Use nf-validation to initiate and validate a pipeline -keywords: - - utility - - pipeline - - initialise - - validation -components: [] -input: - - print_help: - type: boolean - description: | - Print help message and exit - - workflow_command: - type: string - description: | - The command to run the workflow e.g. 
"nextflow run main.nf" - - pre_help_text: - type: string - description: | - Text to print before the help message - - post_help_text: - type: string - description: | - Text to print after the help message - - validate_params: - type: boolean - description: | - Validate the parameters and error if invalid. - - schema_filename: - type: string - description: | - The filename of the schema to validate against. -output: - - dummy_emit: - type: boolean - description: | - Dummy emit to make nf-core subworkflows lint happy -authors: - - "@adamrtalbot" -maintainers: - - "@adamrtalbot" - - "@maxulysse" diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/main.nf.test deleted file mode 100644 index 5784a33f..00000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/main.nf.test +++ /dev/null @@ -1,200 +0,0 @@ -nextflow_workflow { - - name "Test Workflow UTILS_NFVALIDATION_PLUGIN" - script "../main.nf" - workflow "UTILS_NFVALIDATION_PLUGIN" - tag "subworkflows" - tag "subworkflows_nfcore" - tag "plugin/nf-validation" - tag "'plugin/nf-validation'" - tag "utils_nfvalidation_plugin" - tag "subworkflows/utils_nfvalidation_plugin" - - test("Should run nothing") { - - when { - - params { - monochrome_logs = true - test_data = '' - } - - workflow { - """ - help = false - workflow_command = null - pre_help_text = null - post_help_text = null - validate_params = false - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success } - ) - } - } - - test("Should run help") { - - - when { - - params { - monochrome_logs = true - test_data = '' - } - workflow { - """ - help = true - workflow_command = null - pre_help_text = null - post_help_text = null - validate_params = false - 
schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.exitStatus == 0 }, - { assert workflow.stdout.any { it.contains('Input/output options') } }, - { assert workflow.stdout.any { it.contains('--outdir') } } - ) - } - } - - test("Should run help with command") { - - when { - - params { - monochrome_logs = true - test_data = '' - } - workflow { - """ - help = true - workflow_command = "nextflow run noorg/doesntexist" - pre_help_text = null - post_help_text = null - validate_params = false - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.exitStatus == 0 }, - { assert workflow.stdout.any { it.contains('nextflow run noorg/doesntexist') } }, - { assert workflow.stdout.any { it.contains('Input/output options') } }, - { assert workflow.stdout.any { it.contains('--outdir') } } - ) - } - } - - test("Should run help with extra text") { - - - when { - - params { - monochrome_logs = true - test_data = '' - } - workflow { - """ - help = true - workflow_command = "nextflow run noorg/doesntexist" - pre_help_text = "pre-help-text" - post_help_text = "post-help-text" - validate_params = false - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.exitStatus == 0 }, - { assert workflow.stdout.any { it.contains('pre-help-text') 
} }, - { assert workflow.stdout.any { it.contains('nextflow run noorg/doesntexist') } }, - { assert workflow.stdout.any { it.contains('Input/output options') } }, - { assert workflow.stdout.any { it.contains('--outdir') } }, - { assert workflow.stdout.any { it.contains('post-help-text') } } - ) - } - } - - test("Should validate params") { - - when { - - params { - monochrome_logs = true - test_data = '' - outdir = 1 - } - workflow { - """ - help = false - workflow_command = null - pre_help_text = null - post_help_text = null - validate_params = true - schema_filename = "$moduleTestDir/nextflow_schema.json" - - input[0] = help - input[1] = workflow_command - input[2] = pre_help_text - input[3] = post_help_text - input[4] = validate_params - input[5] = schema_filename - """ - } - } - - then { - assertAll( - { assert workflow.failed }, - { assert workflow.stdout.any { it.contains('ERROR ~ ERROR: Validation of pipeline parameters failed!') } } - ) - } - } -} diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/tags.yml b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/tags.yml deleted file mode 100644 index 60b1cfff..00000000 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfvalidation_plugin: - - subworkflows/nf-core/utils_nfvalidation_plugin/** diff --git a/tests/.nftignore b/tests/.nftignore new file mode 100644 index 00000000..6b84cdef --- /dev/null +++ b/tests/.nftignore @@ -0,0 +1,16 @@ +.DS_Store +multiqc/multiqc_data/multiqc.parquet +multiqc/multiqc_data/multiqc.log +multiqc/multiqc_data/multiqc_data.json +multiqc/multiqc_data/multiqc_sources.txt +multiqc/multiqc_data/multiqc_software_versions.txt +multiqc/multiqc_data/llms-full.txt +multiqc/multiqc_plots/{svg,pdf,png}/*.{svg,pdf,png} +multiqc/multiqc_report.html +pipeline_info/*.{html,json,txt,yml} +**.py +**.parquet +**multiqc_geo*metadata.txt +**geo*metadata.tsv +**skewness.csv +**skewness.txt diff --git 
a/tests/act/README.md b/tests/act/README.md new file mode 100644 index 00000000..3c890a45 --- /dev/null +++ b/tests/act/README.md @@ -0,0 +1,47 @@ +# Mimic runs of nf-test in GitHub runners using act + +This folder contains all the necessary files to run `nf-test` tests using [act](https://nektosact.com/introduction.html). + +## Install act + +To install `act`, simply run: + +``` +curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash +``` + +> [!NOTE] +> You might then have to place the act binary in a folder in your `$PATH`. + +> [!IMPORTANT] +> `act` uses `docker` under the hood. To install `docker`, see the [installation instructions](https://docs.docker.com/engine/install/). + +## Setup tests to run + +The `params.env` comprises all the necessary configuration to run the tests you need: + +- profile(s) +- Nextflow version + +## Run tests + +You need to specify in `params.env` the profile(s) that will be used. All the other nf-test arguments must be provided as usual. 
+ +Example: + +```.env +#params.env +NXF_VER=25.04.0 +PROFILE=conda +``` + +``` +# from the root folder of your repo +tests/act/run --tag --debug --verbose +``` + +## Clean generated files + +``` +sudo rm -rf .nf-test +``` diff --git a/tests/act/actions/nf-test/action.yml b/tests/act/actions/nf-test/action.yml new file mode 100644 index 00000000..1b0c638e --- /dev/null +++ b/tests/act/actions/nf-test/action.yml @@ -0,0 +1,85 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile(s) to use for nf-test" + required: true + args: + description: "Arguments to pass to nf-test" + required: true +runs: + using: "composite" + steps: + - name: Install Node.js + uses: actions/setup-node@v6 + with: + node-version: 22 + + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 + with: + python-version: "3.14" + + - name: Install nf-test + run: | + wget -qO- https://get.nf-test.com | bash + mv nf-test /usr/local/bin + + - uses: actions/cache@v5 + with: + path: /var/cache/apt/archives + key: apt-deps-${{ runner.os }} + + - name: Install apptainer dependencies + if: contains(inputs.profile, 'apptainer') + shell: bash + run: | + apt update + apt install -y libfuse3-3 uidmap fakeroot + + - name: Setup apptainer + if: contains(inputs.profile, 'apptainer') + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.4.5 + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + channels: conda-forge + channel-priority: strict + 
conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile ${{ inputs.profile }} \ + ${{ inputs.args }} + + - name: Upload nf-test artifacts + if: failure() + uses: actions/upload-artifact@v4 + with: + name: nf-test-artifacts + path: .nf-test/tests + include-hidden-files: true + overwrite: true + compression-level: 0 diff --git a/tests/act/nf-test.yml b/tests/act/nf-test.yml new file mode 100644 index 00000000..52bde4aa --- /dev/null +++ b/tests/act/nf-test.yml @@ -0,0 +1,44 @@ +name: Run nf-test +on: + push: + +env: + NFT_VER: "0.9.3" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test: + name: nf-test + runs-on: local + env: + NXF_ANSI_LOG: false + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: Get profile + id: get_profile + run: | + if [ -z "${{ env.PROFILE }}" ]; then + echo "Using default profile ${{ env.DEFAULT_PROFILE }}" + echo "profile=${{ env.DEFAULT_PROFILE }}" >> $GITHUB_OUTPUT + else + echo "Using profile ${{ env.PROFILE }}" + echo "profile=${{ env.PROFILE }}" >> $GITHUB_OUTPUT + fi + + - name: Run nf-test + id: run_nf_test + uses: ./tests/act/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + NXF_VERSION: ${{ env.NXF_VER }} + with: + profile: ${{ steps.get_profile.outputs.profile }} + args: ${{ env.ARGS }} diff --git a/tests/act/params.env b/tests/act/params.env new file mode 100644 index 00000000..d99a6307 --- /dev/null +++ b/tests/act/params.env @@ -0,0 +1,6 @@ +# for best reproducibility, use the full image +IMAGE=catthehacker/ubuntu:full-24.04 +# on some systems, the base image can work +#IMAGE=catthehacker/ubuntu:act-24.04 +NXF_VER=25.04.0 +DEFAULT_PROFILE=docker diff 
--git a/tests/act/run b/tests/act/run new file mode 100755 index 00000000..7b6a36ef --- /dev/null +++ b/tests/act/run @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +act_folder=$(dirname "$(realpath $0)") +root_folder=$(dirname $(dirname "${act_folder}")) + +ACT_OUTPUT_FOLDERNAME="act_output" +act_output_folder="${root_folder}/${ACT_OUTPUT_FOLDERNAME}" +mkdir -p $act_output_folder +######################################### +# Parse arguments +######################################### + +args="" +profile="" +bind_args="" +while [[ $# -gt 0 ]]; do + if [[ "$1" == "--profile" ]]; then + profile="$2" + shift 2 + else + args="${args} $1" # append the string to args + shift + fi +done + +echo "Running with args: ${args} and profile(s): ${profile}" + +######################################### +# Run act +######################################### + +act push \ + --job nf-test \ + --directory "${root_folder}" \ + --env-file "${act_folder}/params.env" \ + --env ARGS="${args}" \ + --env PROFILE="${profile}" \ + --workflows "${act_folder}/nf-test.yml" \ + --platform local=catthehacker/ubuntu:act-24.04 \ + --container-architecture linux/amd64 \ + --container-options "--privileged" \ + --artifact-server-path "${act_output_folder}" diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 00000000..908fb62b --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,309 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + + test("-profile test") { + tag "test" + + when { + params { + species = 'beta vulgaris' + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_dataset_only") { + tag 
"test_dataset_only" + + when { + params { + species = 'mus musculus' + datasets = "https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_big.yaml" + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_public_and_dataset") { + tag "test_public_and_dataset" + + when { + params { + species = 'beta vulgaris' + keywords = "leaf" + datasets = "https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_beta_vulgaris.csv" + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + /* + //TODO: see why it gives issues in CI + test("-profile test_fetch_geo") { + tag "test_fetch_geo" + + when { + params { + species = 'beta vulgaris' + fetch_geo_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + */ + + test("-profile 
test_accessions_only") { + tag "test_accessions_only" + + when { + params { + species = 'beta vulgaris' + accessions_only = true + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + + test("-profile test_download_only") { + tag "test_download_only" + + when { + params { + species = 'beta vulgaris' + download_only = true + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + + test("-profile 
test_one_accession_low_gene_count") { + tag "test_one_accession_low_gene_count" + + when { + params { + species = 'arabidopsis thaliana' + accessions = "E-GEOD-51720" + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_skip_id_mapping") { + tag "test_skip_id_mapping" + + when { + params { + species = 'solanum tuberosum' + datasets = "${projectDir}/tests/test_data/input_datasets/input.csv" + skip_id_mapping = true + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_dataset_custom_mapping_and_gene_length") { + tag "test_dataset_custom_mapping_and_gene_length" + + when { + params { + species = 'solanum tuberosum' + datasets = "${projectDir}/tests/test_data/input_datasets/input.csv" + skip_id_mapping = true + skip_fetch_eatlas_accessions = true + gene_id_mapping = "${projectDir}/tests/test_data/input_datasets/mapping.csv" + gene_metadata = "${projectDir}/tests/test_data/input_datasets/metadata.csv" + gene_length = "${projectDir}/tests/test_data/input_datasets/gene_lengths.csv" + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: 
true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + /* + // TODO: see why this test works locally, even with act, but fails in CI + test("-profile test_included_and_excluded_accessions") { + tag "test_included_and_excluded_accessions" + + when { + params { + species = "solanum tuberosum" + accessions = "E-MTAB-552,E-GEOD-61690" + excluded_accessions = "E-MTAB-4251" + accessions_file = "${projectDir}/tests/test_data/misc/accessions_to_include.txt" + excluded_accessions_file = "${projectDir}/tests/test_data/misc/excluded_accessions.txt" + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + */ + /* + test("-profile test_gprofiler_target_database_entrez") { + + when { + params { + species = 'beta vulgaris' + gprofiler_target_db = 'ENTREZGENE' + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, 
ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + */ + +} diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 00000000..aad13d07 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,2049 @@ +{ + "-profile test_dataset_only": { + "content": [ + { + "AGGREGATE_RESULTS": { + "polars": "1.39.2", + "python": "3.14.3", + "pyyaml": "6.0.3" + }, + "CLEAN_GENE_IDS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "COLLECT_ALL_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + "COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_M_MEASURE": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_STABILITY_SCORES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_TPM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "CROSS_JOIN": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "DASH_APP": { + "python": "3.14.3", + "dash": "3.3.0", + "dash-extensions": "2.0.4", + "dash-mantine-components": "2.4.0", + "dash-ag-grid": "32.3.2", + "polars": "1.39.2", + "pandas": "2.3.3", + "pyarrow": "23.0.1", + "scipy": "1.17.1" + }, + "DESCRIPTIVE_STATISTICS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "DETECT_RARE_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.3", + "httpx": "0.28.1", + "pandas": "3.0.1", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "EXPRESSION_RATIO": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "EXTRACT_GENE_IDS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.39.2", + 
"python": "3.14.3" + }, + "GLOBAL": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GPROFILER_IDMAPPING": { + "httpx": "0.28.1", + "pandas": "3.0.1", + "python": "3.14.3" + }, + "IMPUTE_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + "MAKE_CHUNKS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "NORMFINDER": { + "numba": "0.64.0", + "numpy": "2.4.3", + "polars": "1.39.2", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "PLATFORM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + "RATIO_STANDARD_VARIATION": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_ZEROS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + 
"aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + 
"dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Mus_musculus.GRCm39.115.chr.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + 
"multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + 
"multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_renaming_warning_reasons.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + "multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay", + "normalised/quantile_normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/SRP254919.salmon.merged.gene_counts.top1000cov.assay", + "normalised/tpm/SRP254919.salmon.merged.gene_counts.top1000cov.assay/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + 
"statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings", + "warnings/renaming_warning_reasons.tsv" + ], + [ + "all_genes_summary.csv:md5,67694aeb7cb1bec8e31a604fa5350783", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,f62b0f3c0462c8a6aeba61ed083ce07e", + "section_1.most_stable_genes_transposed_counts.csv:md5,7c197e7b57cdaee0b0250aed93050e24", + "section_10.most_stable_genes_summary.csv:md5,9b8dde024b554d9b2318bad7e4b76252", + "section_10.most_stable_genes_transposed_counts.csv:md5,6d9df5bef8a5b44340bbdc141d229f68", + "section_11.most_stable_genes_summary.csv:md5,255237f708e36bcd0290be1e811fd8b2", + "section_11.most_stable_genes_transposed_counts.csv:md5,08c786a9859eeae873dacbec33bb2e9e", + "section_12.most_stable_genes_summary.csv:md5,7597d72cce386c406bae974f6bccc089", + "section_12.most_stable_genes_transposed_counts.csv:md5,e93e7f64d0b5890ad95b5e0a9398f255", + "section_13.most_stable_genes_summary.csv:md5,5d0906956d78014f3bef28227e4af8f6", + "section_13.most_stable_genes_transposed_counts.csv:md5,93070ff7b4e36a2486a4b8b45f957ac8", + "section_14.most_stable_genes_summary.csv:md5,5e4743e56fbe09cc030f8401944d219c", + "section_14.most_stable_genes_transposed_counts.csv:md5,dd80d0a9118f20f47b3d06e479423713", + "section_15.most_stable_genes_summary.csv:md5,87a32675404d3908277e725954daf477", + "section_15.most_stable_genes_transposed_counts.csv:md5,0e6ed0bc24c70e3fca43691d87b39eec", + "section_16.most_stable_genes_summary.csv:md5,10b6a0507815b5a0f19371945cee71d9", + "section_16.most_stable_genes_transposed_counts.csv:md5,0dedfedd98859fabb0ba0bb57b08efce", + "section_17.most_stable_genes_summary.csv:md5,6e9ea1f25adaafdeec78ad3419815b68", + "section_17.most_stable_genes_transposed_counts.csv:md5,85a7c2959d89324cc6f250493d4520a7", + "section_18.most_stable_genes_summary.csv:md5,cafcd5fff5842789dc2024c9ec2b45d8", + 
"section_18.most_stable_genes_transposed_counts.csv:md5,69a36fb3843257e1c67fe06282a22671", + "section_19.most_stable_genes_summary.csv:md5,1a2f4da7114df0cdc08c09f024718d0c", + "section_19.most_stable_genes_transposed_counts.csv:md5,4e4b2100fca4c6d461333edf6e5fd9e7", + "section_2.most_stable_genes_summary.csv:md5,74094f7f58405bcfce2972a1073db7ee", + "section_2.most_stable_genes_transposed_counts.csv:md5,67e877f84141462327383d622cab7b49", + "section_20.most_stable_genes_summary.csv:md5,57068c5336541f1cfa8ae699f098b0b6", + "section_20.most_stable_genes_transposed_counts.csv:md5,5691275bafee54ec2520d82011295495", + "section_3.most_stable_genes_summary.csv:md5,537ce26b668dceffd0c23126fb97cef4", + "section_3.most_stable_genes_transposed_counts.csv:md5,907103b9138b1d27b6a17d321ca59bca", + "section_4.most_stable_genes_summary.csv:md5,cb353777ed864969add3036f4aa664ea", + "section_4.most_stable_genes_transposed_counts.csv:md5,2794b4e29fa21c53604cfc189ce6ecdf", + "section_5.most_stable_genes_summary.csv:md5,667468cdafb70c899dd3f7b7cb603ba6", + "section_5.most_stable_genes_transposed_counts.csv:md5,4d4395309a84a820d6c3169906824657", + "section_6.most_stable_genes_summary.csv:md5,62d2ade9627014c122f22bc1a32776a6", + "section_6.most_stable_genes_transposed_counts.csv:md5,73806b0e960342be7378aface0618229", + "section_7.most_stable_genes_summary.csv:md5,56a27a3d372727d3a0dfdef023f9aad2", + "section_7.most_stable_genes_transposed_counts.csv:md5,2067a0574e51f1fe8bf590addbfd6ea9", + "section_8.most_stable_genes_summary.csv:md5,79c41e03a858dd68a5614b1343938c24", + "section_8.most_stable_genes_transposed_counts.csv:md5,35e02009d4aff77f0fca02711a8ad058", + "section_9.most_stable_genes_summary.csv:md5,0e7776950594471fc8dfb914eed0a17a", + "section_9.most_stable_genes_transposed_counts.csv:md5,e102dcbb3bdcec1bd724adbd772655d1", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,67694aeb7cb1bec8e31a604fa5350783", + 
"whole_design.csv:md5,f29515bc2c783e593fb9028127342593", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Mus_musculus.GRCm39.115.chr.gff3.gz:md5,66a5d70eeb2ce9685ca871fc7b0f4f96", + "gene_transcript_lengths.csv:md5,09e2d2a8881df9aa96ee71802e9c3451", + "global_gene_id_mapping.csv:md5,78934d2ac5fe7d863f114c5703f57a06", + "global_gene_metadata.csv:md5,bd76860b422e45eca7cd583212a977d2", + "gene_metadata.csv:md5,bd76860b422e45eca7cd583212a977d2", + "mapped_gene_ids.csv:md5,78934d2ac5fe7d863f114c5703f57a06", + "whole_design.csv:md5,f29515bc2c783e593fb9028127342593", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_gene_statistics.txt:md5,9285ae2cfc531a0987e3172be0aa6483", + "multiqc_genes_section_1.txt:md5,3d44381703173383e455e59e84c3ecd9", + "multiqc_genes_section_1_1.txt:md5,49f24363739b3b2952ad813b0a1dc7c9", + "multiqc_genes_section_1_10.txt:md5,23eda043b3773d143f42c79943785baa", + "multiqc_genes_section_1_11.txt:md5,f221ca530b336e01b6c5a7f3a74b4262", + "multiqc_genes_section_1_12.txt:md5,df9f14313b281daae80438b134f25326", + "multiqc_genes_section_1_13.txt:md5,186cbc4df4a2d8bb05ff272725ad573e", + "multiqc_genes_section_1_14.txt:md5,efb4bfa55b0981b5b683a5d3bcf4fee3", + "multiqc_genes_section_1_15.txt:md5,cc47d6728943dafbdf0fa65ef0d075d9", + "multiqc_genes_section_1_16.txt:md5,35b7292de1819b89ece3df46081e6db3", + "multiqc_genes_section_1_17.txt:md5,8468a21c35f53afffe1f3f5a49f56aa7", + "multiqc_genes_section_1_18.txt:md5,8f2906665b62c75ab1786214124c02d1", + "multiqc_genes_section_1_19.txt:md5,d879c38bd6016d13a55adf59b4de7d99", + "multiqc_genes_section_1_2.txt:md5,58126f33166e756917d3fca0c66aafa8", + "multiqc_genes_section_1_3.txt:md5,dded6f5abee4f377eb093d6b95b6daa0", + "multiqc_genes_section_1_4.txt:md5,2c9a3ba7a78140a1e01afcc15b35c835", + "multiqc_genes_section_1_5.txt:md5,cddedab71e149bb731d6dee130dbea65", + "multiqc_genes_section_1_6.txt:md5,8c21ab38a3c761b0fefc029812b1cc35", + 
"multiqc_genes_section_1_7.txt:md5,d5779f1d0e92d80de7dc728e375d57ee", + "multiqc_genes_section_1_8.txt:md5,66e168eafc2bd8be9e9a397d0fc1c4b9", + "multiqc_genes_section_1_9.txt:md5,62a4f655b779f14b751e47f32bf7ccf1", + "multiqc_id_mapping_stats.txt:md5,600e9fa5656a06a3288ea7e6d9fef647", + "multiqc_normalised_expr_distrib_section_1.txt:md5,342306198c5930791d9255b481b6daa8", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,3a1b52103dd52cceeede5b99f0c18d1c", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,69d9ea655368e4b61ea3c49dc336ccc3", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,cf5fcd0fb87409255e88f808993570a8", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,e0767d376933c8849a08ba998acaee39", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,76d7fd5652e923ad09000bc80f9aa4ca", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,ce675ff07c194b1c975879f3288a27e8", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,5936ef07006c81b688289ec764994932", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,1a5a56d8661bcc0f986058a90ebb81b0", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,8c4c83bdcc648b4cbd5876c58d8c30d8", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,832ad87d6f4689a459e443a032f67f9d", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,83be81b32d7c48685015c2ede21fb511", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,f1e22de47e393569f3d193a30bbdc9cd", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,9e190919da3fc866beb78076ad8c4a33", + "multiqc_normalised_expr_distrib_section_1_4.txt:md5,3712b42a48fa257ad75f2deba10f631d", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,d927439c8e03a2e6f2ad18f16a90afff", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,b2d87c20485c9ffdd0237c4372d9d6e9", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,d18c331b910ce9702b5a977521c39aa1", + 
"multiqc_normalised_expr_distrib_section_1_8.txt:md5,1a6c7f079559251385f268beee39c9cb", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,688908e507b313d477957cb1d7d6e1a2", + "multiqc_null_values_filter.txt:md5,64ca3e3acc613e1b85733fd847712a37", + "multiqc_ratio_nulls.txt:md5,7063b06cadcf854671bc9cefb51a6fe3", + "multiqc_ratio_zeros.txt:md5,7063b06cadcf854671bc9cefb51a6fe3", + "multiqc_renaming_warning_reasons.txt:md5,6e3001e79809e518b23efc517fc5bc67", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,ca154d649786ea5336e7c9e980f00eac", + "multiqc_zero_values_filter.txt:md5,64ca3e3acc613e1b85733fd847712a37", + "id_mapping_stats.csv:md5,b47d6ebd34e3fb11a40665b0a38db3da", + "missing_values_filter_stats.csv:md5,310182ec872cf37ffb81370dfcd01207", + "ratio_nulls.csv:md5,2272ebcf58ac8bb283d238f87d508b96", + "ratio_nulls_per_sample.csv:md5,375371c6d3e58ae69430f0e96b71920d", + "ratio_zeros.csv:md5,2272ebcf58ac8bb283d238f87d508b96", + "zero_values_filter_stats.csv:md5,310182ec872cf37ffb81370dfcd01207", + "renaming_warning_reasons.tsv:md5,0a11a59b5b547a39ab7a0e4dac622173" + ] + ], + "timestamp": "2026-04-05T09:10:44.402386214", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "-profile test_fetch_geo": { + "content": [ + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + 
"aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + 
"dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "geo", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.parquet", + "idmapping/renamed/GSE55951_GPL18429.microarray.normalised.counts.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + 
"multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_geo_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_geo_rejected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_geo_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_geo_warning_reasons.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + 
"multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_renaming_warning_reasons.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + "multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + 
"normalised/quantile_normalised/E_MTAB_8187_rnaseq", + "normalised/quantile_normalised/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/E_MTAB_8187_rnaseq", + "normalised/tpm/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "public_data/geo", + "public_data/geo/accessions", + "public_data/geo/accessions/accessions.txt", + "public_data/geo/accessions/geo_all_datasets.metadata.tsv", + "public_data/geo/accessions/geo_rejected_datasets.metadata.tsv", + "public_data/geo/accessions/geo_selected_datasets.metadata.tsv", + "public_data/geo/datasets", + "public_data/geo/datasets/GSE55951_GPL18429.microarray.normalised.counts.csv", + "public_data/geo/datasets/GSE55951_GPL18429.microarray.normalised.design.csv", + "public_data/geo/datasets/rejected", + "public_data/geo/datasets/rejected/GSE135555_suppl", + "public_data/geo/datasets/rejected/GSE135555_suppl/GSE135555_suppl.rnaseq.raw.counts.csv", + "public_data/geo/datasets/rejected/GSE135555_suppl/GSE135555_suppl.rnaseq.raw.design.csv", + "public_data/geo/datasets/rejected/GSE135555_suppl/GSE135555_suppl.rnaseq.raw.platform_metadata.csv", + "public_data/geo/datasets/rejected/GSE135555_suppl/GSE135555_suppl.rnaseq.raw.sample_name_mapping.csv", + 
"statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings", + "warnings/geo_warning_reasons.csv", + "warnings/renaming_warning_reasons.tsv" + ], + [ + "all_genes_summary.csv:md5,75d3db24909c578bbc764585cc25bde3", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,52ac3fc87801720530cd8ed8bd027698", + "section_1.most_stable_genes_transposed_counts.csv:md5,8363bc69b84c68fe4ecea13b6dc70d98", + "section_10.most_stable_genes_summary.csv:md5,41c3ba1e338277e40e03c9b043059cb0", + "section_10.most_stable_genes_transposed_counts.csv:md5,4a599908cea31077650911161a4fd155", + "section_11.most_stable_genes_summary.csv:md5,136e636de09496412dc76ef7fb10c47b", + "section_11.most_stable_genes_transposed_counts.csv:md5,9aeb482d2ff0cbfaa8d29a5af4357701", + "section_12.most_stable_genes_summary.csv:md5,c27fb0df29ac4fb3bea8df3fbb6ef2b1", + "section_12.most_stable_genes_transposed_counts.csv:md5,edbe661b7c150c1a8af01c3c52ea45f7", + "section_13.most_stable_genes_summary.csv:md5,0395eed958d9571fae34ae29b8fe643e", + "section_13.most_stable_genes_transposed_counts.csv:md5,3ece34d50b412abddbce5da5c05f10de", + "section_14.most_stable_genes_summary.csv:md5,8677aa89331f67690330becf078260e3", + "section_14.most_stable_genes_transposed_counts.csv:md5,15840cc29d8d27881b59f19804134f97", + "section_15.most_stable_genes_summary.csv:md5,182e3a6e3a855340c50b5d2705b84142", + "section_15.most_stable_genes_transposed_counts.csv:md5,8a4c0d3018f3ed87305b4cafa8d3a7ae", + "section_16.most_stable_genes_summary.csv:md5,6c41bed8aea0f1cfa973ae7dfc93a148", + "section_16.most_stable_genes_transposed_counts.csv:md5,e3196137992a40340e20cb46ebd5cbdd", + 
"section_17.most_stable_genes_summary.csv:md5,f4aaec1b2af2e89bf26c156b907097e8", + "section_17.most_stable_genes_transposed_counts.csv:md5,af06eab6bc04fc315544fcd0176da4cd", + "section_18.most_stable_genes_summary.csv:md5,5f21148626ed40d0d64b393babcf160d", + "section_18.most_stable_genes_transposed_counts.csv:md5,29fc2248ad428cb3ac8898b0a5471eec", + "section_19.most_stable_genes_summary.csv:md5,5acc2a1b1980004f88c0584a8cf0784e", + "section_19.most_stable_genes_transposed_counts.csv:md5,9586c452f93c486ed667fb343af3b13c", + "section_2.most_stable_genes_summary.csv:md5,b0f6113c0d0b1994ceb844f884c22083", + "section_2.most_stable_genes_transposed_counts.csv:md5,b22984d5b00ee4540fca59b5585a0a88", + "section_20.most_stable_genes_summary.csv:md5,9d9c5cd95d1d1a350a8d1f2ce363f882", + "section_20.most_stable_genes_transposed_counts.csv:md5,e9f4187bdc7079c3130bdff1e4ebf575", + "section_3.most_stable_genes_summary.csv:md5,a440355a3a59a13bb2a3f2def5e936d4", + "section_3.most_stable_genes_transposed_counts.csv:md5,77d118556692fe285590489db96f47d0", + "section_4.most_stable_genes_summary.csv:md5,221b0d42881ada7cd7fcca65cdc827a4", + "section_4.most_stable_genes_transposed_counts.csv:md5,5d1d9ebe8151765fb37176c86f3c7812", + "section_5.most_stable_genes_summary.csv:md5,a3c3edb5fd3cf852185531a4adcd9fd9", + "section_5.most_stable_genes_transposed_counts.csv:md5,51289b18ac41641114892519d2e494a6", + "section_6.most_stable_genes_summary.csv:md5,5a7baf9eadb389cc234808d56ee6fdfe", + "section_6.most_stable_genes_transposed_counts.csv:md5,bb1cddda97df3915d2aad5973e1c8a16", + "section_7.most_stable_genes_summary.csv:md5,a1ed63a57844d1bce998eea23714f071", + "section_7.most_stable_genes_transposed_counts.csv:md5,82c59e866871569fbde316efea5e7ea3", + "section_8.most_stable_genes_summary.csv:md5,40673407e734107f0cebf2045023155a", + "section_8.most_stable_genes_transposed_counts.csv:md5,0bfb8031fc91115a61a57113a6df5c4d", + 
"section_9.most_stable_genes_summary.csv:md5,7178bb75b1733f71d0aeba2a09750b3b", + "section_9.most_stable_genes_transposed_counts.csv:md5,b02e0d31ed2c0fa925060893062c07a7", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,75d3db24909c578bbc764585cc25bde3", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz:md5,6f2c45809441c8776e6578000db2b0e4", + "gene_transcript_lengths.csv:md5,458c7dfd3598bdcbcb6ceb76ccba189f", + "global_gene_id_mapping.csv:md5,63f67fb73898870c360293d30362bc33", + "global_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "mapped_gene_ids.csv:md5,63f67fb73898870c360293d30362bc33", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_eatlas_all_experiments_metadata.txt:md5,8b7643e0ef8eaaa3fa72f7103fd7ccee", + "multiqc_eatlas_selected_experiments_metadata.txt:md5,8b7643e0ef8eaaa3fa72f7103fd7ccee", + "multiqc_gene_statistics.txt:md5,19eaa1f4db6058db006b045caea1a980", + "multiqc_genes_section_1.txt:md5,e9a38f354b88a9bf35dd9aa4994a4595", + "multiqc_genes_section_1_1.txt:md5,7565958f195732ae112664627bf147e9", + "multiqc_genes_section_1_10.txt:md5,304bd44c0867a1419e7b48e5bb6dff05", + "multiqc_genes_section_1_11.txt:md5,807ad09f10e257546f18e5fb052511e9", + "multiqc_genes_section_1_12.txt:md5,e3d5acc5a292639bc3a1b1b5e7f5a04b", + "multiqc_genes_section_1_13.txt:md5,7dd72d333b12fc101f4a5b555e09d49a", + "multiqc_genes_section_1_14.txt:md5,59d0addf52e85cdf7d0163721c29c095", + "multiqc_genes_section_1_15.txt:md5,b10474b0ad8cd3cdf21dbe8dc4fd3676", + "multiqc_genes_section_1_16.txt:md5,6f038b7c99db654f2d749da25f7c213b", + "multiqc_genes_section_1_17.txt:md5,9f9f97f85d6605978b286942ac69ba2c", + "multiqc_genes_section_1_18.txt:md5,ab6c6e6e1a658ba92baa6dd2b68f56bf", + 
"multiqc_genes_section_1_19.txt:md5,5d4910983359e122e07fdbe2aeda10f7", + "multiqc_genes_section_1_2.txt:md5,c6541fb7c3e5502da20d1c68cb5a44de", + "multiqc_genes_section_1_3.txt:md5,94130719e096ffd035a155aa59b4bdd0", + "multiqc_genes_section_1_4.txt:md5,ba0275140b46c0c2d2690304bfd008d8", + "multiqc_genes_section_1_5.txt:md5,fcdcb0618858bf79586f679f4834f902", + "multiqc_genes_section_1_6.txt:md5,9cf7cebccab8b0073cad3d43d4d2ef92", + "multiqc_genes_section_1_7.txt:md5,d440bc9cce034ba82dd0d9f3387f9094", + "multiqc_genes_section_1_8.txt:md5,dc1f5de798343036301a059b545a378f", + "multiqc_genes_section_1_9.txt:md5,e9402e81e8c32c8a6b4015c4a55962f0", + "multiqc_geo_warning_reasons.txt:md5,2b53d4be74728c504752515f74c58fd2", + "multiqc_id_mapping_stats.txt:md5,f03d4786d088307ad756b9661fd61ede", + "multiqc_normalised_expr_distrib_section_1.txt:md5,fe7c9f8eff636a38deee18a05e17ed4d", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,7578a930f8750ecb56e892a54211e28f", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,696c5b24d54057e4738bbd0b351c5d28", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,94ef2626cd23a3395ba0f53be43b529e", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,cf62d3846d7d00b438719e75551bd3fa", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,825766b14187d801ae2284dffd562ac4", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,b18a4df24ed61f0315d41d4cddfd6539", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,4d99b3d87c9a25b18fa5ed2061dfb71c", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,82305a3ca8a54e44a558d0c83dfca9f3", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,adf99bc87dd29499a1bfc50c3c26488c", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,8b64cbab2e0cca85575b18b41f973aa5", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,4544499f66cd9de554f2d26944028cd5", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,d74f1b40545293b2dba02a0ff167119d", + 
"multiqc_normalised_expr_distrib_section_1_3.txt:md5,e5701cd16921b4ce657ac131418e04d1", + "multiqc_normalised_expr_distrib_section_1_4.txt:md5,fd093b2d0d535ff16ba846bde129f690", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,4dbddb8d44680d3cc45a3053c510ca2d", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,497c20bb2f2d2c03595c897f30775411", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,5c3fb8ff5e1b90d0a9904712204fc36d", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,9e5d9c6fb87d348a893bfed6b24f01ce", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,6a40889210cec540d4b3a2e903454003", + "multiqc_null_values_filter.txt:md5,32deb66a006e612bb582cc8ce0e253dd", + "multiqc_ratio_nulls.txt:md5,d8ae8a87932da88063c21b9e96d7a0b3", + "multiqc_ratio_zeros.txt:md5,63fd4bb33e1160d0071bd2fda14a3434", + "multiqc_renaming_warning_reasons.txt:md5,317e6da04b74b7e4470616a5e791308f", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,d1e0b917cd62b17c37700d4ff0f4e3ee", + "multiqc_zero_values_filter.txt:md5,049541b0d77aac9703471aaafedba758", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "selected_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "species_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1", + "accessions.txt:md5,a850f625a78be7b4b10ce08a5b638e23", + "GSE55951_GPL18429.microarray.normalised.counts.csv:md5,18fd2d728ad2ec5cb78f994f73375144", + "GSE55951_GPL18429.microarray.normalised.design.csv:md5,f4872dff0edbe441d1600ffe2b67a25d", + "GSE135555_suppl.rnaseq.raw.counts.csv:md5,b34bde25ea5d508a1670ce4264073df1", + "GSE135555_suppl.rnaseq.raw.design.csv:md5,df3abc86dd22710223eb8ecc606c9b52", + "GSE135555_suppl.rnaseq.raw.platform_metadata.csv:md5,68b329da9893e34099c7d8ad5cb9c940", + 
"GSE135555_suppl.rnaseq.raw.sample_name_mapping.csv:md5,457ec1886bec4a917447f67141a3355d", + "id_mapping_stats.csv:md5,5acc3cfc65b836d60f5929cd3e18329b", + "missing_values_filter_stats.csv:md5,3532d85f8ded121d4a64f779ff07c6a7", + "ratio_nulls.csv:md5,61a1fa51598dd19691274cec72344086", + "ratio_nulls_per_sample.csv:md5,26be7a3455d1c25335aa4f791d3f5fb8", + "ratio_zeros.csv:md5,dbe4685e6c4b2c698f608d220ac0fddd", + "zero_values_filter_stats.csv:md5,5506c189469bd93293ef11c9727ddca6", + "geo_warning_reasons.csv:md5,1d7787445686300070a7af880d707015", + "renaming_warning_reasons.tsv:md5,ae651ff0a559e025e014412009eac136" + ] + ], + "timestamp": "2026-04-05T09:23:41.24478498", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "-profile test_skip_id_mapping": { + "content": [ + [ + "errors", + "gene_length", + "gene_length/Solanum_tuberosum.SolTub_3.0.62.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/gene_ids.txt", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/microarray.normalised", + "normalised/quantile_normalised/microarray.normalised/microarray.normalised.zeros_filtered.nulls_filtered.quant_norm.parquet", + "normalised/quantile_normalised/rnaseq.raw", + "normalised/quantile_normalised/rnaseq.raw/rnaseq.raw.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/rnaseq.raw", + "normalised/tpm/rnaseq.raw/rnaseq.raw.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + 
"pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "Solanum_tuberosum.SolTub_3.0.62.gff3.gz:md5,cca99141f43d57d697f6df75de790e05", + "gene_transcript_lengths.csv:md5,217aa7c1e227ce2f78a905138d8e5b39", + "gene_ids.txt:md5,831b47f91a0808802967aa0e53a25de9", + "whole_design.csv:md5,70d6c2673e619ca52d2774fb3e368382", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "missing_values_filter_stats.csv:md5,ebad5386e7c670ff04887eff67c8faae", + "ratio_nulls.csv:md5,ab65c49c9b8ba7e242f391438789e080", + "ratio_nulls_per_sample.csv:md5,5c2931cb8c5ecb27ffa9136628fc714c", + "ratio_zeros.csv:md5,1837a5a03a551fdb0a7bba2869157559", + "zero_values_filter_stats.csv:md5,ebad5386e7c670ff04887eff67c8faae" + ] + ], + "timestamp": "2026-04-04T22:10:20.013206958", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "-profile test_dataset_custom_mapping_and_gene_length": { + "content": [ + { + "EXTRACT_GENE_IDS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "errors", + "errors/renaming_failure_reasons.tsv", + "idmapping", + "idmapping/gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/renamed", + "merged_datasets", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + 
"multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/id_mapping_stats.csv", + "warnings" + ], + [ + "renaming_failure_reasons.tsv:md5,d5cae52d86b44b02d7bd00c456576b5d", + "gene_ids.txt:md5,831b47f91a0808802967aa0e53a25de9", + "global_gene_id_mapping.csv:md5,187a86074197044846bb8565e122eb8e", + "global_gene_metadata.csv:md5,5ae2d701ca0cb6384d9e1e08a345e452", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "id_mapping_stats.csv:md5,20bd1443c864cb013c97efc760465e9c" + ] + ], + "timestamp": "2026-03-21T12:53:02.926804675", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "-profile test": { + "content": [ + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + 
"aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + 
"dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + 
"multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + 
"multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + "multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/E_MTAB_8187_rnaseq", + "normalised/quantile_normalised/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/E_MTAB_8187_rnaseq", + "normalised/tpm/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", 
+ "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,e3f8d59accf267c351d0a995ffc9ebf5", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,be640cd7efc6a7ac3df989b9ab9a6448", + "section_1.most_stable_genes_transposed_counts.csv:md5,8363bc69b84c68fe4ecea13b6dc70d98", + "section_10.most_stable_genes_summary.csv:md5,41c3ba1e338277e40e03c9b043059cb0", + "section_10.most_stable_genes_transposed_counts.csv:md5,4a599908cea31077650911161a4fd155", + "section_11.most_stable_genes_summary.csv:md5,136e636de09496412dc76ef7fb10c47b", + "section_11.most_stable_genes_transposed_counts.csv:md5,9aeb482d2ff0cbfaa8d29a5af4357701", + "section_12.most_stable_genes_summary.csv:md5,c27fb0df29ac4fb3bea8df3fbb6ef2b1", + "section_12.most_stable_genes_transposed_counts.csv:md5,edbe661b7c150c1a8af01c3c52ea45f7", + "section_13.most_stable_genes_summary.csv:md5,0395eed958d9571fae34ae29b8fe643e", + "section_13.most_stable_genes_transposed_counts.csv:md5,3ece34d50b412abddbce5da5c05f10de", + "section_14.most_stable_genes_summary.csv:md5,8677aa89331f67690330becf078260e3", + "section_14.most_stable_genes_transposed_counts.csv:md5,15840cc29d8d27881b59f19804134f97", + "section_15.most_stable_genes_summary.csv:md5,182e3a6e3a855340c50b5d2705b84142", + "section_15.most_stable_genes_transposed_counts.csv:md5,8a4c0d3018f3ed87305b4cafa8d3a7ae", + "section_16.most_stable_genes_summary.csv:md5,6c41bed8aea0f1cfa973ae7dfc93a148", + "section_16.most_stable_genes_transposed_counts.csv:md5,e3196137992a40340e20cb46ebd5cbdd", + 
"section_17.most_stable_genes_summary.csv:md5,f4aaec1b2af2e89bf26c156b907097e8", + "section_17.most_stable_genes_transposed_counts.csv:md5,af06eab6bc04fc315544fcd0176da4cd", + "section_18.most_stable_genes_summary.csv:md5,5f21148626ed40d0d64b393babcf160d", + "section_18.most_stable_genes_transposed_counts.csv:md5,29fc2248ad428cb3ac8898b0a5471eec", + "section_19.most_stable_genes_summary.csv:md5,5acc2a1b1980004f88c0584a8cf0784e", + "section_19.most_stable_genes_transposed_counts.csv:md5,9586c452f93c486ed667fb343af3b13c", + "section_2.most_stable_genes_summary.csv:md5,95e986dad2f0232070aa47079b6465c1", + "section_2.most_stable_genes_transposed_counts.csv:md5,b22984d5b00ee4540fca59b5585a0a88", + "section_20.most_stable_genes_summary.csv:md5,9d9c5cd95d1d1a350a8d1f2ce363f882", + "section_20.most_stable_genes_transposed_counts.csv:md5,e9f4187bdc7079c3130bdff1e4ebf575", + "section_3.most_stable_genes_summary.csv:md5,7825d8dbcfd1c4e5a4e4ca42268d4ea8", + "section_3.most_stable_genes_transposed_counts.csv:md5,77d118556692fe285590489db96f47d0", + "section_4.most_stable_genes_summary.csv:md5,221b0d42881ada7cd7fcca65cdc827a4", + "section_4.most_stable_genes_transposed_counts.csv:md5,5d1d9ebe8151765fb37176c86f3c7812", + "section_5.most_stable_genes_summary.csv:md5,a3c3edb5fd3cf852185531a4adcd9fd9", + "section_5.most_stable_genes_transposed_counts.csv:md5,51289b18ac41641114892519d2e494a6", + "section_6.most_stable_genes_summary.csv:md5,5a7baf9eadb389cc234808d56ee6fdfe", + "section_6.most_stable_genes_transposed_counts.csv:md5,bb1cddda97df3915d2aad5973e1c8a16", + "section_7.most_stable_genes_summary.csv:md5,a1ed63a57844d1bce998eea23714f071", + "section_7.most_stable_genes_transposed_counts.csv:md5,82c59e866871569fbde316efea5e7ea3", + "section_8.most_stable_genes_summary.csv:md5,40673407e734107f0cebf2045023155a", + "section_8.most_stable_genes_transposed_counts.csv:md5,0bfb8031fc91115a61a57113a6df5c4d", + 
"section_9.most_stable_genes_summary.csv:md5,7178bb75b1733f71d0aeba2a09750b3b", + "section_9.most_stable_genes_transposed_counts.csv:md5,b02e0d31ed2c0fa925060893062c07a7", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,e3f8d59accf267c351d0a995ffc9ebf5", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz:md5,6f2c45809441c8776e6578000db2b0e4", + "gene_transcript_lengths.csv:md5,458c7dfd3598bdcbcb6ceb76ccba189f", + "global_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "global_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "mapped_gene_ids.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_eatlas_all_experiments_metadata.txt:md5,8b7643e0ef8eaaa3fa72f7103fd7ccee", + "multiqc_eatlas_selected_experiments_metadata.txt:md5,8b7643e0ef8eaaa3fa72f7103fd7ccee", + "multiqc_gene_statistics.txt:md5,d7750cb95663a63219dcec94e03d7af1", + "multiqc_genes_section_1.txt:md5,f310a16068d5e76713497e2d3824cf2d", + "multiqc_genes_section_1_1.txt:md5,d68c3cce20e06aaf226e88e0e52184b3", + "multiqc_genes_section_1_10.txt:md5,304bd44c0867a1419e7b48e5bb6dff05", + "multiqc_genes_section_1_11.txt:md5,807ad09f10e257546f18e5fb052511e9", + "multiqc_genes_section_1_12.txt:md5,e3d5acc5a292639bc3a1b1b5e7f5a04b", + "multiqc_genes_section_1_13.txt:md5,7dd72d333b12fc101f4a5b555e09d49a", + "multiqc_genes_section_1_14.txt:md5,59d0addf52e85cdf7d0163721c29c095", + "multiqc_genes_section_1_15.txt:md5,b10474b0ad8cd3cdf21dbe8dc4fd3676", + "multiqc_genes_section_1_16.txt:md5,6f038b7c99db654f2d749da25f7c213b", + "multiqc_genes_section_1_17.txt:md5,9f9f97f85d6605978b286942ac69ba2c", + "multiqc_genes_section_1_18.txt:md5,ab6c6e6e1a658ba92baa6dd2b68f56bf", + 
"multiqc_genes_section_1_19.txt:md5,5d4910983359e122e07fdbe2aeda10f7", + "multiqc_genes_section_1_2.txt:md5,89b5e91c54815bd340411210fb7b86a7", + "multiqc_genes_section_1_3.txt:md5,94130719e096ffd035a155aa59b4bdd0", + "multiqc_genes_section_1_4.txt:md5,ba0275140b46c0c2d2690304bfd008d8", + "multiqc_genes_section_1_5.txt:md5,fcdcb0618858bf79586f679f4834f902", + "multiqc_genes_section_1_6.txt:md5,9cf7cebccab8b0073cad3d43d4d2ef92", + "multiqc_genes_section_1_7.txt:md5,d440bc9cce034ba82dd0d9f3387f9094", + "multiqc_genes_section_1_8.txt:md5,dc1f5de798343036301a059b545a378f", + "multiqc_genes_section_1_9.txt:md5,e9402e81e8c32c8a6b4015c4a55962f0", + "multiqc_id_mapping_stats.txt:md5,52e27cd2411482e92177128367f5bcf2", + "multiqc_normalised_expr_distrib_section_1.txt:md5,fe7c9f8eff636a38deee18a05e17ed4d", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,7578a930f8750ecb56e892a54211e28f", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,696c5b24d54057e4738bbd0b351c5d28", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,94ef2626cd23a3395ba0f53be43b529e", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,cf62d3846d7d00b438719e75551bd3fa", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,825766b14187d801ae2284dffd562ac4", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,b18a4df24ed61f0315d41d4cddfd6539", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,4d99b3d87c9a25b18fa5ed2061dfb71c", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,82305a3ca8a54e44a558d0c83dfca9f3", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,adf99bc87dd29499a1bfc50c3c26488c", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,8b64cbab2e0cca85575b18b41f973aa5", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,4544499f66cd9de554f2d26944028cd5", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,d74f1b40545293b2dba02a0ff167119d", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,e5701cd16921b4ce657ac131418e04d1", + 
"multiqc_normalised_expr_distrib_section_1_4.txt:md5,fd093b2d0d535ff16ba846bde129f690", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,4dbddb8d44680d3cc45a3053c510ca2d", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,497c20bb2f2d2c03595c897f30775411", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,5c3fb8ff5e1b90d0a9904712204fc36d", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,9e5d9c6fb87d348a893bfed6b24f01ce", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,6a40889210cec540d4b3a2e903454003", + "multiqc_null_values_filter.txt:md5,36e80c213d14d0e3942d84e8ad14b9cc", + "multiqc_ratio_nulls.txt:md5,ba7ee3e2a9f20f19bae430c56ca11e9a", + "multiqc_ratio_zeros.txt:md5,238d38905ef9fdc5e6a252255dea7f82", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,389252b8ba48d86c6100d0abcd762ac1", + "multiqc_zero_values_filter.txt:md5,36e80c213d14d0e3942d84e8ad14b9cc", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "selected_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "species_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1", + "id_mapping_stats.csv:md5,17ccaa8e70c67c7d0de4ec3c630c2e5b", + "missing_values_filter_stats.csv:md5,a4a1e6b5e88fc2226c01f237b90214db", + "ratio_nulls.csv:md5,3649422febfc0208bb0f1892d071a0a1", + "ratio_nulls_per_sample.csv:md5,88f76a381ba0635b334ea65f1dc9311f", + "ratio_zeros.csv:md5,e31a1f46c19c75381bd237f520658bf3", + "zero_values_filter_stats.csv:md5,a4a1e6b5e88fc2226c01f237b90214db" + ] + ], + "timestamp": "2026-04-05T09:07:09.566479758", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "-profile test_accessions_only": { + "content": [ + { + "EXPRESSION_ATLAS": { + "httpx": "0.28.1", + "nltk": "3.9.2", + "pandas": "3.0.1", + "python": "3.14.3", + "pyyaml": "6.0.3" + }, + "Workflow": { + 
"nf-core/stableexpression": "v1.0.0" + } + }, + [ + "errors", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "statistics", + "warnings" + ], + [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "selected_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "species_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2" + ] + ], + "timestamp": "2026-03-29T14:35:30.092884854", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "-profile test_one_accession_low_gene_count": { + "content": [ + { + "AGGREGATE_RESULTS": { + "polars": "1.39.2", + "python": "3.14.3", + "pyyaml": "6.0.3" + }, + "CLEAN_GENE_IDS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "COLLECT_ALL_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + "COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_M_MEASURE": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_STABILITY_SCORES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_TPM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "CROSS_JOIN": { + "polars": 
"1.39.2", + "python": "3.14.3" + }, + "DASH_APP": { + "python": "3.14.3", + "dash": "3.3.0", + "dash-extensions": "2.0.4", + "dash-mantine-components": "2.4.0", + "dash-ag-grid": "32.3.2", + "polars": "1.39.2", + "pandas": "2.3.3", + "pyarrow": "23.0.1", + "scipy": "1.17.1" + }, + "DESCRIPTIVE_STATISTICS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "DETECT_RARE_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.3", + "httpx": "0.28.1", + "pandas": "3.0.1", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.34.0", + "R": "4.4.3 (2025-02-28)" + }, + "EXPRESSION_RATIO": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "EXTRACT_GENE_IDS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GLOBAL": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GPROFILER_IDMAPPING": { + "httpx": "0.28.1", + "pandas": "3.0.1", + "python": "3.14.3" + }, + "IMPUTE_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + "MAKE_CHUNKS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "NORMFINDER": { + "numba": "0.64.0", + "numpy": "2.4.3", + "polars": "1.39.2", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "PLATFORM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + "RATIO_STANDARD_VARIATION": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_ZEROS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + 
"aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + 
"aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Arabidopsis_thaliana.TAIR10.62.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", 
+ "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + 
"multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + "multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + 
"normalised/quantile_normalised/E_GEOD_51720_rnaseq", + "normalised/quantile_normalised/E_GEOD_51720_rnaseq/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/E_GEOD_51720_rnaseq", + "normalised/tpm/E_GEOD_51720_rnaseq/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_GEOD_51720_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_GEOD_51720_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,643bb1aa5f128bad6f192bd2aeaa2ee6", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,911c1c687cdc308f5aecaef42d504a89", + "section_1.most_stable_genes_transposed_counts.csv:md5,849135e7f42258dd2975d74f136d23aa", + "section_10.most_stable_genes_summary.csv:md5,131be639b26c51537ec05d67258a2820", + "section_10.most_stable_genes_transposed_counts.csv:md5,aa8748ba5cab4cb2387a616326d82023", + "section_11.most_stable_genes_summary.csv:md5,6e363a28b0762735cdf575f6aec3fb54", + "section_11.most_stable_genes_transposed_counts.csv:md5,705e2b4becb685f21490948f648cee0a", + "section_12.most_stable_genes_summary.csv:md5,6e49cbc5af4a45fcd62f9a9c9d1c82ad", + "section_12.most_stable_genes_transposed_counts.csv:md5,a757cf115a30079e4dea9ebe44e587d5", + "section_13.most_stable_genes_summary.csv:md5,6765ae522f95e29af34c118c36464510", + 
"section_13.most_stable_genes_transposed_counts.csv:md5,578d598340aa36cf38852e06e619190a", + "section_14.most_stable_genes_summary.csv:md5,dba0d2e1803d588bbc213896ea143d56", + "section_14.most_stable_genes_transposed_counts.csv:md5,53f8590e2ddfbbc80e1e72516f5b821a", + "section_15.most_stable_genes_summary.csv:md5,1ff8d851ef7bceecb1bb96111cf42ed9", + "section_15.most_stable_genes_transposed_counts.csv:md5,2f1987c6e0327610cfaf3b5ac4b17c99", + "section_16.most_stable_genes_summary.csv:md5,22a67ebb023441a8428b8d9277c237f7", + "section_16.most_stable_genes_transposed_counts.csv:md5,467d69d7581c7b2d008b6a69004775f2", + "section_17.most_stable_genes_summary.csv:md5,e5cbc51cfe86c7b2225804410d30665b", + "section_17.most_stable_genes_transposed_counts.csv:md5,d758e3e9e4274ff7815af4fa9f84154d", + "section_18.most_stable_genes_summary.csv:md5,828dd90d5c39cf1b714e2804dd7b8d84", + "section_18.most_stable_genes_transposed_counts.csv:md5,2f633511784b3babc159c4ecfed76fa2", + "section_19.most_stable_genes_summary.csv:md5,b32ed5d4a50671ac38a4a616dc81b2b9", + "section_19.most_stable_genes_transposed_counts.csv:md5,b507a8bbe8e2d3852e7952e932917751", + "section_2.most_stable_genes_summary.csv:md5,439d0e60a30d7232508e695a210053c5", + "section_2.most_stable_genes_transposed_counts.csv:md5,a1803a9577616d7a098ad1567817cb20", + "section_20.most_stable_genes_summary.csv:md5,0d82b5d34b415947bdda4d016fa52f71", + "section_20.most_stable_genes_transposed_counts.csv:md5,3a1ae07c51acb0a1672e210a8a137121", + "section_3.most_stable_genes_summary.csv:md5,1ade5c406fe691b48a7f6b56b4778971", + "section_3.most_stable_genes_transposed_counts.csv:md5,71d9e444731c709189ed569ada9be4c1", + "section_4.most_stable_genes_summary.csv:md5,aa1216a538b2723ac246fd336b8a3fcb", + "section_4.most_stable_genes_transposed_counts.csv:md5,8bd766e3232e4f7591cba721cbf305dc", + "section_5.most_stable_genes_summary.csv:md5,84e099dbe057240baa5542e035214362", + 
"section_5.most_stable_genes_transposed_counts.csv:md5,180382fc6c81bc94032fb592425d1596", + "section_6.most_stable_genes_summary.csv:md5,e455df268552dbede82debdaff7f2bb5", + "section_6.most_stable_genes_transposed_counts.csv:md5,da8bc59c611f88c51b047f6ccb50d08b", + "section_7.most_stable_genes_summary.csv:md5,ef6db8ade4ffd92d0ef872b8e4c88417", + "section_7.most_stable_genes_transposed_counts.csv:md5,b1d1db3949dd5a07ea45baf10c184d05", + "section_8.most_stable_genes_summary.csv:md5,911c809c86111dc0597a953cbfa26d62", + "section_8.most_stable_genes_transposed_counts.csv:md5,337a0e231598d45291a6a42a25c585b1", + "section_9.most_stable_genes_summary.csv:md5,cfaafcd65fffaed8169835cfc0992430", + "section_9.most_stable_genes_transposed_counts.csv:md5,cdb7220619e76d11963f1f1b08101e42", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,643bb1aa5f128bad6f192bd2aeaa2ee6", + "whole_design.csv:md5,d3aa542c4ad07d0051a84482fe6cd81c", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Arabidopsis_thaliana.TAIR10.62.gff3.gz:md5,b02566c301d47461db70747b3adaa6ce", + "gene_transcript_lengths.csv:md5,06b4612031f4f300a6d67f36e7625492", + "global_gene_id_mapping.csv:md5,42491ef436cce231258c0358e1af5745", + "global_gene_metadata.csv:md5,b35e20500269d4e6787ef1a3468f16bc", + "gene_metadata.csv:md5,b35e20500269d4e6787ef1a3468f16bc", + "mapped_gene_ids.csv:md5,42491ef436cce231258c0358e1af5745", + "whole_design.csv:md5,d3aa542c4ad07d0051a84482fe6cd81c", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_gene_statistics.txt:md5,53fe105326f1a097d3437731eb4e3a8d", + "multiqc_genes_section_1.txt:md5,cb79085a6608e6dfd5a96291dcea850b", + "multiqc_genes_section_1_1.txt:md5,7308dd2805d9b530457a1eb839e1b455", + "multiqc_genes_section_1_10.txt:md5,91e61a6a01bdf35096fb02f79476dd58", + "multiqc_genes_section_1_11.txt:md5,f0b8df84a99b2d5ef557ee8896217095", + "multiqc_genes_section_1_12.txt:md5,89e8c3dcd3d970735de56ed6dd618caf", + 
"multiqc_genes_section_1_13.txt:md5,b7b1b4265c236ba1c8ed7358e34a6dd6", + "multiqc_genes_section_1_14.txt:md5,831514e662296f82a3f0370ae64b1503", + "multiqc_genes_section_1_15.txt:md5,92aa9a142514894d965ce5f41bee781d", + "multiqc_genes_section_1_16.txt:md5,e905ff948cccf03b24177517e39078ad", + "multiqc_genes_section_1_17.txt:md5,920067a6137cbded388b393f4a84d0bf", + "multiqc_genes_section_1_18.txt:md5,e46e3add55d144e8dc04087498b73b65", + "multiqc_genes_section_1_19.txt:md5,72e10039958b0d2667136688b35411cf", + "multiqc_genes_section_1_2.txt:md5,210eff8a16470b70dd186c52aa218512", + "multiqc_genes_section_1_3.txt:md5,15f3d0a57e714b176361689eece78b90", + "multiqc_genes_section_1_4.txt:md5,36cb183f89030a540dc51f83fe0073c4", + "multiqc_genes_section_1_5.txt:md5,6bd50c3d3040facf83fb70d3aad70caf", + "multiqc_genes_section_1_6.txt:md5,420fee370865219de09913c9eb827a49", + "multiqc_genes_section_1_7.txt:md5,fb4c14faf2e007704f1fcb21949deb2d", + "multiqc_genes_section_1_8.txt:md5,df2f893d352fc6992f8d95e18f30a1e4", + "multiqc_genes_section_1_9.txt:md5,75c27fc9730c4346074c667cc8d1c885", + "multiqc_id_mapping_stats.txt:md5,49023d9842e01da40e2c50e9659802d5", + "multiqc_normalised_expr_distrib_section_1.txt:md5,9e50c1075664481653bb278323672633", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,a1fa5d657a142abbf49fb95bf266d906", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,f89a15a3af0047f9bd0f5d01ca9ccb33", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,fead0770f22c316593d6d2353d94e9f7", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,8cacaee9d1bedf3ec8a4d66f3bab1f7f", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,2567a9943c1c49e575b4c2fe6a3a3185", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,02d40fd44721ec46f59736221500078a", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,0c89badaf4e435df8526ae8e9f4802ab", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,3fe2b8ffacda4c1f8ca761eb7a1e1086", + 
"multiqc_normalised_expr_distrib_section_1_17.txt:md5,c86c0cf8c3e4eab7a61979f622f126d7", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,17554bf8a45621ecdedefe2a9b79835e", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,28b90411fa811ba678f237e9ee6f20a2", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,b3876970c55302cb37f1bd8f8ca620ee", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,2ead25fe7da0f48beca784882fabb1a6", + "multiqc_normalised_expr_distrib_section_1_4.txt:md5,68245ac492b42288c310612a5e88cbe4", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,ac5f414686facdfc71016982d3824875", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,88d20ad256f42e564daf79ca8c13a1a2", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,4cb4700660dd2613194c7b62324d019b", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,43eb422269b358c59e2d31f9602b24b3", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,d444233cf608c17cfdc7cc8ebf2c2fe9", + "multiqc_null_values_filter.txt:md5,91eb32460cdebb4e08ae0b1ee559cf59", + "multiqc_ratio_nulls.txt:md5,bcf9aa423c404f2e7f8ea84735810959", + "multiqc_ratio_zeros.txt:md5,c743a773da2858b59923eff1873c26d0", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,9eb24790b7fbfee4b7c3bcff74a334db", + "multiqc_zero_values_filter.txt:md5,a9ec449705f94f15962e6ca856b87420", + "E_GEOD_51720_rnaseq.design.csv:md5,80805afb29837b6fbb73a6aa6f3a461b", + "E_GEOD_51720_rnaseq.rnaseq.raw.counts.csv:md5,07cd448196fc2fea4663bd9705da2b98", + "id_mapping_stats.csv:md5,cd17a5d4afa6b86a48adb03868d3073f", + "missing_values_filter_stats.csv:md5,cd1ab16f9c485f8e739a54344cde1aed", + "ratio_nulls.csv:md5,9c496b3b8c098a1bc17c6be7a87f2331", + "ratio_nulls_per_sample.csv:md5,9211cb6081071e8825119194faf6241f", + "ratio_zeros.csv:md5,17b7bde6ca29e11bb1e28db6b8053add", + "zero_values_filter_stats.csv:md5,766d888e41179e8a785f634b3b606bc9" + ] + ], + "timestamp": "2026-04-05T09:30:42.794565916", + "meta": { + "nf-test": "0.9.5", + 
"nextflow": "25.10.4" + } + }, + "-profile test_public_and_dataset": { + "content": [ + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + 
"aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + 
"idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.parquet", + "idmapping/renamed/beta_vulgaris.rnaseq.raw.counts.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + 
"multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + 
"multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/E_MTAB_8187_rnaseq", + "normalised/quantile_normalised/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/E_MTAB_8187_rnaseq", + "normalised/tpm/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,e3f8d59accf267c351d0a995ffc9ebf5", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,be640cd7efc6a7ac3df989b9ab9a6448", + 
"section_1.most_stable_genes_transposed_counts.csv:md5,8363bc69b84c68fe4ecea13b6dc70d98", + "section_10.most_stable_genes_summary.csv:md5,41c3ba1e338277e40e03c9b043059cb0", + "section_10.most_stable_genes_transposed_counts.csv:md5,4a599908cea31077650911161a4fd155", + "section_11.most_stable_genes_summary.csv:md5,136e636de09496412dc76ef7fb10c47b", + "section_11.most_stable_genes_transposed_counts.csv:md5,9aeb482d2ff0cbfaa8d29a5af4357701", + "section_12.most_stable_genes_summary.csv:md5,c27fb0df29ac4fb3bea8df3fbb6ef2b1", + "section_12.most_stable_genes_transposed_counts.csv:md5,edbe661b7c150c1a8af01c3c52ea45f7", + "section_13.most_stable_genes_summary.csv:md5,0395eed958d9571fae34ae29b8fe643e", + "section_13.most_stable_genes_transposed_counts.csv:md5,3ece34d50b412abddbce5da5c05f10de", + "section_14.most_stable_genes_summary.csv:md5,8677aa89331f67690330becf078260e3", + "section_14.most_stable_genes_transposed_counts.csv:md5,15840cc29d8d27881b59f19804134f97", + "section_15.most_stable_genes_summary.csv:md5,182e3a6e3a855340c50b5d2705b84142", + "section_15.most_stable_genes_transposed_counts.csv:md5,8a4c0d3018f3ed87305b4cafa8d3a7ae", + "section_16.most_stable_genes_summary.csv:md5,6c41bed8aea0f1cfa973ae7dfc93a148", + "section_16.most_stable_genes_transposed_counts.csv:md5,e3196137992a40340e20cb46ebd5cbdd", + "section_17.most_stable_genes_summary.csv:md5,f4aaec1b2af2e89bf26c156b907097e8", + "section_17.most_stable_genes_transposed_counts.csv:md5,af06eab6bc04fc315544fcd0176da4cd", + "section_18.most_stable_genes_summary.csv:md5,5f21148626ed40d0d64b393babcf160d", + "section_18.most_stable_genes_transposed_counts.csv:md5,29fc2248ad428cb3ac8898b0a5471eec", + "section_19.most_stable_genes_summary.csv:md5,5acc2a1b1980004f88c0584a8cf0784e", + "section_19.most_stable_genes_transposed_counts.csv:md5,9586c452f93c486ed667fb343af3b13c", + "section_2.most_stable_genes_summary.csv:md5,95e986dad2f0232070aa47079b6465c1", + 
"section_2.most_stable_genes_transposed_counts.csv:md5,b22984d5b00ee4540fca59b5585a0a88", + "section_20.most_stable_genes_summary.csv:md5,9d9c5cd95d1d1a350a8d1f2ce363f882", + "section_20.most_stable_genes_transposed_counts.csv:md5,e9f4187bdc7079c3130bdff1e4ebf575", + "section_3.most_stable_genes_summary.csv:md5,7825d8dbcfd1c4e5a4e4ca42268d4ea8", + "section_3.most_stable_genes_transposed_counts.csv:md5,77d118556692fe285590489db96f47d0", + "section_4.most_stable_genes_summary.csv:md5,221b0d42881ada7cd7fcca65cdc827a4", + "section_4.most_stable_genes_transposed_counts.csv:md5,5d1d9ebe8151765fb37176c86f3c7812", + "section_5.most_stable_genes_summary.csv:md5,a3c3edb5fd3cf852185531a4adcd9fd9", + "section_5.most_stable_genes_transposed_counts.csv:md5,51289b18ac41641114892519d2e494a6", + "section_6.most_stable_genes_summary.csv:md5,5a7baf9eadb389cc234808d56ee6fdfe", + "section_6.most_stable_genes_transposed_counts.csv:md5,bb1cddda97df3915d2aad5973e1c8a16", + "section_7.most_stable_genes_summary.csv:md5,a1ed63a57844d1bce998eea23714f071", + "section_7.most_stable_genes_transposed_counts.csv:md5,82c59e866871569fbde316efea5e7ea3", + "section_8.most_stable_genes_summary.csv:md5,40673407e734107f0cebf2045023155a", + "section_8.most_stable_genes_transposed_counts.csv:md5,0bfb8031fc91115a61a57113a6df5c4d", + "section_9.most_stable_genes_summary.csv:md5,7178bb75b1733f71d0aeba2a09750b3b", + "section_9.most_stable_genes_transposed_counts.csv:md5,b02e0d31ed2c0fa925060893062c07a7", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,e3f8d59accf267c351d0a995ffc9ebf5", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz:md5,6f2c45809441c8776e6578000db2b0e4", + "gene_transcript_lengths.csv:md5,458c7dfd3598bdcbcb6ceb76ccba189f", + "global_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + 
"global_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "mapped_gene_ids.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_eatlas_all_experiments_metadata.txt:md5,8b7643e0ef8eaaa3fa72f7103fd7ccee", + "multiqc_eatlas_selected_experiments_metadata.txt:md5,8b7643e0ef8eaaa3fa72f7103fd7ccee", + "multiqc_gene_statistics.txt:md5,d7750cb95663a63219dcec94e03d7af1", + "multiqc_genes_section_1.txt:md5,f310a16068d5e76713497e2d3824cf2d", + "multiqc_genes_section_1_1.txt:md5,d68c3cce20e06aaf226e88e0e52184b3", + "multiqc_genes_section_1_10.txt:md5,304bd44c0867a1419e7b48e5bb6dff05", + "multiqc_genes_section_1_11.txt:md5,807ad09f10e257546f18e5fb052511e9", + "multiqc_genes_section_1_12.txt:md5,e3d5acc5a292639bc3a1b1b5e7f5a04b", + "multiqc_genes_section_1_13.txt:md5,7dd72d333b12fc101f4a5b555e09d49a", + "multiqc_genes_section_1_14.txt:md5,59d0addf52e85cdf7d0163721c29c095", + "multiqc_genes_section_1_15.txt:md5,b10474b0ad8cd3cdf21dbe8dc4fd3676", + "multiqc_genes_section_1_16.txt:md5,6f038b7c99db654f2d749da25f7c213b", + "multiqc_genes_section_1_17.txt:md5,9f9f97f85d6605978b286942ac69ba2c", + "multiqc_genes_section_1_18.txt:md5,ab6c6e6e1a658ba92baa6dd2b68f56bf", + "multiqc_genes_section_1_19.txt:md5,5d4910983359e122e07fdbe2aeda10f7", + "multiqc_genes_section_1_2.txt:md5,89b5e91c54815bd340411210fb7b86a7", + "multiqc_genes_section_1_3.txt:md5,94130719e096ffd035a155aa59b4bdd0", + "multiqc_genes_section_1_4.txt:md5,ba0275140b46c0c2d2690304bfd008d8", + "multiqc_genes_section_1_5.txt:md5,fcdcb0618858bf79586f679f4834f902", + "multiqc_genes_section_1_6.txt:md5,9cf7cebccab8b0073cad3d43d4d2ef92", + "multiqc_genes_section_1_7.txt:md5,d440bc9cce034ba82dd0d9f3387f9094", + "multiqc_genes_section_1_8.txt:md5,dc1f5de798343036301a059b545a378f", + 
"multiqc_genes_section_1_9.txt:md5,e9402e81e8c32c8a6b4015c4a55962f0", + "multiqc_id_mapping_stats.txt:md5,d7c6d500c8ea91c32da4980b5557d15e", + "multiqc_normalised_expr_distrib_section_1.txt:md5,fe7c9f8eff636a38deee18a05e17ed4d", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,7578a930f8750ecb56e892a54211e28f", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,696c5b24d54057e4738bbd0b351c5d28", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,94ef2626cd23a3395ba0f53be43b529e", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,cf62d3846d7d00b438719e75551bd3fa", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,825766b14187d801ae2284dffd562ac4", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,b18a4df24ed61f0315d41d4cddfd6539", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,4d99b3d87c9a25b18fa5ed2061dfb71c", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,82305a3ca8a54e44a558d0c83dfca9f3", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,adf99bc87dd29499a1bfc50c3c26488c", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,8b64cbab2e0cca85575b18b41f973aa5", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,4544499f66cd9de554f2d26944028cd5", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,d74f1b40545293b2dba02a0ff167119d", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,e5701cd16921b4ce657ac131418e04d1", + "multiqc_normalised_expr_distrib_section_1_4.txt:md5,fd093b2d0d535ff16ba846bde129f690", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,4dbddb8d44680d3cc45a3053c510ca2d", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,497c20bb2f2d2c03595c897f30775411", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,5c3fb8ff5e1b90d0a9904712204fc36d", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,9e5d9c6fb87d348a893bfed6b24f01ce", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,6a40889210cec540d4b3a2e903454003", + 
"multiqc_null_values_filter.txt:md5,88b2d9e16cd8ab52f58a48fd5d915b8c", + "multiqc_ratio_nulls.txt:md5,c9ac04a67937c7bacfebc33fcd50aab1", + "multiqc_ratio_zeros.txt:md5,9f50cd64ea4afe3723c7e222182981f6", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,497b807412eb4478e97ff0c50846c9ce", + "multiqc_zero_values_filter.txt:md5,4082d32f92221ed686e79910c6d2f6b3", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "selected_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "species_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1", + "id_mapping_stats.csv:md5,dc2d9d7f34e570411c8cf5885b447719", + "missing_values_filter_stats.csv:md5,7db5e238928f520d761bd4792334304b", + "ratio_nulls.csv:md5,62625b0e4f7f36a59dfe077a4c709a94", + "ratio_nulls_per_sample.csv:md5,be115e6d6c5ed7b7206891ebaa0f7a67", + "ratio_zeros.csv:md5,96bbe4bd2d4c29ab5701588132af9684", + "zero_values_filter_stats.csv:md5,17fc6d525450d34445bf9cc25defe18a" + ] + ], + "timestamp": "2026-04-05T09:16:48.007287186", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "-profile test_download_only": { + "content": [ + { + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.34.0", + "R": "4.4.3 (2025-02-28)", + "httpx": "0.28.1", + "nltk": "3.9.2", + "pandas": "3.0.1", + "python": "3.14.3", + "pyyaml": "6.0.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "errors", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + 
"pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "warnings" + ], + [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "selected_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "species_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ], + "timestamp": "2026-03-29T14:36:07.351960504", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/input/custom_datasets/input.csv b/tests/input/custom_datasets/input.csv deleted file mode 100644 index 954bfc0e..00000000 --- a/tests/input/custom_datasets/input.csv +++ /dev/null @@ -1,3 +0,0 @@ -counts,design,normalized -tests/input/custom_datasets/normalized.csv,tests/input/custom_datasets/normalized.design.csv,true -tests/input/custom_datasets/raw.csv,tests/input/custom_datasets/raw.design.csv,false diff --git a/tests/input/custom_datasets/normalized.csv b/tests/input/custom_datasets/normalized.csv deleted file mode 100644 index 2a926476..00000000 --- a/tests/input/custom_datasets/normalized.csv +++ /dev/null @@ -1,10 +0,0 @@ -,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586 
-ENSRNA049434199,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434246,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434252,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434260,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434273,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434281,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434309,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434344,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 -ENSRNA049434386,0.126148617281988,0.19838680107162,0.0960190734578323,0.0952170475957331,0.154053780171438,0.188205036552322,0.0678014076474789,0.0721303532291245 diff --git a/tests/input/custom_datasets/raw.csv b/tests/input/custom_datasets/raw.csv deleted file mode 100644 index f8c8e32f..00000000 --- a/tests/input/custom_datasets/raw.csv +++ /dev/null @@ -1,10 +0,0 @@ -,ESM1528575,ESM1528576,ESM1528579,ESM1528583,ESM1528584,ESM1528585,ESM1528580,ESM1528586,ESM1528582,ESM1528578,ESM1528581,ESM1528577 -ENSRNA049434199,1,82,8,82,4,68,88,73,46,57,25,22 -ENSRNA049434246,68,93,41,84,36,18,28,92,84,85,92,32 -ENSRNA049434252,38,10,0,23,11,17,95,57,25,82,10,70 
-ENSRNA049434260,75,55,7,30,79,60,15,97,12,35,60,56 -ENSRNA049434273,35,64,55,91,48,95,68,100,24,26,100,47 -ENSRNA049434281,8,99,80,48,86,29,80,17,19,9,44,2 -ENSRNA049434309,67,7,98,53,3,10,52,87,4,80,22,15 -ENSRNA049434344,8,40,24,90,42,52,79,81,94,23,35,81 -ENSRNA049434386,45,49,67,73,26,76,41,16,34,47,36,25 diff --git a/tests/input/idmapping/counts.ensembl_ids.csv b/tests/input/idmapping/counts.ensembl_ids.csv deleted file mode 100644 index 0a9dbca4..00000000 --- a/tests/input/idmapping/counts.ensembl_ids.csv +++ /dev/null @@ -1,4 +0,0 @@ -ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ERR029918,ERR029920,ERR029921,ERR029922,ERR029923,ERR029924 -ENSRNA049434199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -ENSRNA049434246,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -ENSRNA049434252,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/tests/input/merge_designs/design_1.csv b/tests/input/merge_designs/design_1.csv deleted file mode 100644 index 9afd9984..00000000 --- a/tests/input/merge_designs/design_1.csv +++ /dev/null @@ -1,7 +0,0 @@ -batch,group,sample -E-GEOD-456,g1,ERR475578 -E-GEOD-456,g1,ERR475579 -E-GEOD-456,g1,ERR475580 -E-GEOD-456,g2,ERR475581 -E-GEOD-456,g2,ERR475582 -E-GEOD-456,g2,ERR475583 diff --git a/tests/input/merge_designs/design_2.csv b/tests/input/merge_designs/design_2.csv deleted file mode 100644 index 8488c46e..00000000 --- a/tests/input/merge_designs/design_2.csv +++ /dev/null @@ -1,7 +0,0 @@ -batch,group,sample -E-GEOD-457,g1,SRR475578 -E-GEOD-457,g1,SRR475579 -E-GEOD-457,g1,SRR475580 -E-GEOD-457,g3,SRR475581 -E-GEOD-457,g3,SRR475582 -E-GEOD-457,g3,SRR475583 diff --git a/tests/input/variation_coefficient/counts.ncbi_ids_renamed.csv b/tests/input/variation_coefficient/counts.ncbi_ids_renamed.csv deleted file mode 100644 index 5ed0d802..00000000 --- a/tests/input/variation_coefficient/counts.ncbi_ids_renamed.csv +++ /dev/null @@ -1,5 +0,0 @@ -,ARR029909,ARR029910,ARR029911,ARR029912,ARR029913,ARR029914,ARR029915,ARR029916,ARR029917 
-AT1G34790,5.295157,34.545567,87.415197,47.819509,80.885364,53.474538,22.531628,45.648511,71.444148 -AT5G35550,47.706579,47.408592,97.530235,0.821778,47.796096,80.552969,55.432428,63.057793,2.289668 -AT5G23260,87.802874,4.881611,69.563466,55.819793,3.572211,99.246230,79.524440,76.626335,1.435747 - diff --git a/tests/input/variation_coefficient/counts.uniprot_ids_renamed.csv b/tests/input/variation_coefficient/counts.uniprot_ids_renamed.csv deleted file mode 100644 index 57c1ce9c..00000000 --- a/tests/input/variation_coefficient/counts.uniprot_ids_renamed.csv +++ /dev/null @@ -1,5 +0,0 @@ -,URR029909,URR029910,URR029911,URR029912,URR029913,URR029914,URR029915,URR029916,URR029917 -AT1G34790,60.113057,64.080682,93.481811,35.197164,20.115891,93.052843,71.002869,65.849011,16.239896 -AT5G35550,71.485047,21.713193, 3.318757,18.404822,70.246917,75.552686,83.366080, 0.340416,23.179154 -AT5G23260,71.122807,47.981484,85.599454,69.023553,40.420572,30.220852,73.996866, 8.559519,80.013134 - diff --git a/tests/main.nf.test b/tests/main.nf.test deleted file mode 100644 index 13ad3464..00000000 --- a/tests/main.nf.test +++ /dev/null @@ -1,24 +0,0 @@ -nextflow_pipeline { - - name "Test Workflow main.nf" - script "main.nf" - tag "pipeline" - - test("Full pipeline - Expression Atlas only") { - - when { - params { - species = 'solanum tuberosum' - eatlas_accessions = "E-MTAB-552,E-GEOD-61690" - fetch_eatlas_accessions = true - eatlas_keywords = "phloem" - } - } - - then { - assert workflow.success - } - - } - -} diff --git a/tests/modules/local/aggregate_results/main.nf.test b/tests/modules/local/aggregate_results/main.nf.test new file mode 100644 index 00000000..27bb47d4 --- /dev/null +++ b/tests/modules/local/aggregate_results/main.nf.test @@ -0,0 +1,154 @@ +nextflow_process { + + name "Test Process AGGREGATE_RESULTS" + script "modules/local/aggregate_results/main.nf" + process "AGGREGATE_RESULTS" + tag "aggregate_results" + + test("Without microarray") { + + when { + process { + """ 
+ input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true) ] + input[3] = [] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("With microarray") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ + file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/aggregate_results/microarray_stats_all_genes.csv', checkIfExists: true) + ] + input[3] = [] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("With valid target genes") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ + file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/aggregate_results/microarray_stats_all_genes.csv', checkIfExists: true) + ] + input[3] = ["ENSRNA049454747", "ENSRNA049434246"] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One invalid target gene") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ + file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/aggregate_results/microarray_stats_all_genes.csv', checkIfExists: true) + ] + input[3] = ["ENSRNA049454747", "UNKNOWNGENEID1234"] + input[4] = file( 
'$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One section") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true) + input[2] = [ file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true) ] + input[3] = [] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/aggregate_results/main.nf.test.snap b/tests/modules/local/aggregate_results/main.nf.test.snap new file mode 100644 index 00000000..ef6e609a --- /dev/null +++ b/tests/modules/local/aggregate_results/main.nf.test.snap @@ -0,0 +1,330 @@ +{ + "With valid target genes": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "1": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "2": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + 
"section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,edf372668919bebe05783bc16995c5c4" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,edf372668919bebe05783bc16995c5c4" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "timestamp": "2026-04-04T09:38:02.365611798", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Without microarray": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,62a7b6ba136e4e2f7ab954386a6fbe5e" + ], + "1": [ + [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78", + "section_2.most_stable_genes_summary.csv:md5,edc6b56e2f4710c490906cd8c9a54790" + ] + ], + "2": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + 
"all_genes_summary": [ + "all_genes_summary.csv:md5,62a7b6ba136e4e2f7ab954386a6fbe5e" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78", + "section_2.most_stable_genes_summary.csv:md5,edc6b56e2f4710c490906cd8c9a54790" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "timestamp": "2026-03-30T14:06:37.615799808", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "One invalid target gene": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "1": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "2": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,401b9b35a47e29a8dfac3ca7700e26bd" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,401b9b35a47e29a8dfac3ca7700e26bd" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + 
"section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "timestamp": "2026-03-30T14:47:07.501875225", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "One section": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,d2d279f4c5243b3af01130ca04b5603d" + ], + "1": [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78" + ], + "2": [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd" + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,4941e220852b2c814302f508cf5837cd" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,d2d279f4c5243b3af01130ca04b5603d" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,4941e220852b2c814302f508cf5837cd" + ], + "most_stable_genes_summary": [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78" + ], + "most_stable_genes_transposed_counts_filtered": [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd" + ] + } + ], + "timestamp": "2026-03-30T14:47:13.474058057", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "With microarray": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "1": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "2": [ + [ + 
"section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "timestamp": "2026-03-30T14:46:55.231695582", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_dataset_statistics/main.nf.test b/tests/modules/local/compute_dataset_statistics/main.nf.test new file mode 100644 index 00000000..d0e815b7 --- /dev/null +++ b/tests/modules/local/compute_dataset_statistics/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process COMPUTE_DATASET_STATISTICS" + script "modules/local/compute_dataset_statistics/main.nf" + process "COMPUTE_DATASET_STATISTICS" + tag "dataset_stats" + + /* + TODO: see why this test works locally, even with act, but fails in CI + test("Should not fail") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( 
'$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + */ + +} diff --git a/tests/modules/local/compute_dataset_statistics/main.nf.test.snap b/tests/modules/local/compute_dataset_statistics/main.nf.test.snap new file mode 100644 index 00000000..f0454e06 --- /dev/null +++ b/tests/modules/local/compute_dataset_statistics/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should not fail": { + "content": [ + { + "0": [ + [ + "test", + "skewness.txt:md5,0503443761b306e254ac1c0075ea267e" + ] + ], + "1": [ + [ + "COMPUTE_DATASET_STATISTICS", + "python", + "3.12.8" + ] + ], + "2": [ + [ + "COMPUTE_DATASET_STATISTICS", + "polars", + "1.37.1" + ] + ] + } + ], + "timestamp": "2026-04-02T14:11:44.847136183", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_gene_statistics/main.nf.test b/tests/modules/local/compute_gene_statistics/main.nf.test new file mode 100644 index 00000000..9b126a91 --- /dev/null +++ b/tests/modules/local/compute_gene_statistics/main.nf.test @@ -0,0 +1,83 @@ +nextflow_process { + + name "Test Process COMPUTE_GENE_STATISTICS" + script "modules/local/compute_gene_statistics/main.nf" + process "COMPUTE_GENE_STATISTICS" + tag "gene_stats" + + test("No platform") { + + when { + process { + """ + input[0] = [ + [ platform: 'all' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + input[1] = file( '$projectDir/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv', checkIfExists: true) + input[2] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + + test("RNAseq platform") { + + when { + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + input[1] = file( '$projectDir/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv', checkIfExists: true) + input[2] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("No imputed values") { + + when { + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true), + [] + ] + input[1] = file( '$projectDir/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv', checkIfExists: true) + input[2] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/compute_gene_statistics/main.nf.test.snap b/tests/modules/local/compute_gene_statistics/main.nf.test.snap new file mode 100644 index 00000000..9db76e85 --- /dev/null +++ b/tests/modules/local/compute_gene_statistics/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "No imputed values": { + "content": [ + { + "0": [ + "rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ], + "1": [ + [ + "COMPUTE_GENE_STATISTICS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_GENE_STATISTICS", + "polars", + "1.39.2" + ] + ], + "stats": [ + "rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ] + } + ], + "timestamp": "2026-03-30T14:48:46.011713833", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "No platform": { + "content": [ + { + "0": [ + 
"stats_all_genes.csv:md5,42e9e52c43527e80489294a2c2dbbec0" + ], + "1": [ + [ + "COMPUTE_GENE_STATISTICS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_GENE_STATISTICS", + "polars", + "1.39.2" + ] + ], + "stats": [ + "stats_all_genes.csv:md5,42e9e52c43527e80489294a2c2dbbec0" + ] + } + ], + "timestamp": "2026-03-30T14:48:33.525954126", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "RNAseq platform": { + "content": [ + { + "0": [ + "rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ], + "1": [ + [ + "COMPUTE_GENE_STATISTICS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_GENE_STATISTICS", + "polars", + "1.39.2" + ] + ], + "stats": [ + "rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ] + } + ], + "timestamp": "2026-03-30T14:48:39.77826003", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_stability_scores/main.nf.test b/tests/modules/local/compute_stability_scores/main.nf.test new file mode 100644 index 00000000..275fdf75 --- /dev/null +++ b/tests/modules/local/compute_stability_scores/main.nf.test @@ -0,0 +1,58 @@ +nextflow_process { + + name "Test Process COMPUTE_STABILITY_SCORES" + script "modules/local/compute_stability_scores/main.nf" + process "COMPUTE_STABILITY_SCORES" + tag "stability_scores" + + test("With Genorm") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet', checkIfExists: true), + ] + input[1] = "0.8,0.1,0.1,0.1" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + 
test("Without Genorm") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv', checkIfExists: true), + [], + file( '$projectDir/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet', checkIfExists: true), + ] + input[1] = "0.8,0.1,0.1,0.1" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/compute_stability_scores/main.nf.test.snap b/tests/modules/local/compute_stability_scores/main.nf.test.snap new file mode 100644 index 00000000..ebedc0ce --- /dev/null +++ b/tests/modules/local/compute_stability_scores/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "With Genorm": { + "content": [ + { + "0": [ + "section_1.stats_with_scores.csv:md5,7b1dd3c6e4a666561ca6ebe14aae7b74" + ], + "1": [ + [ + "COMPUTE_STABILITY_SCORES", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_STABILITY_SCORES", + "polars", + "1.39.2" + ] + ], + "stats_with_stability_scores": [ + "section_1.stats_with_scores.csv:md5,7b1dd3c6e4a666561ca6ebe14aae7b74" + ] + } + ], + "timestamp": "2026-03-30T15:20:22.075756497", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Without Genorm": { + "content": [ + { + "0": [ + "section_1.stats_with_scores.csv:md5,bdf823d07ed6fed0313e5cf2ce1811a6" + ], + "1": [ + [ + "COMPUTE_STABILITY_SCORES", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_STABILITY_SCORES", + "polars", + "1.39.2" + ] + ], + "stats_with_stability_scores": [ + "section_1.stats_with_scores.csv:md5,bdf823d07ed6fed0313e5cf2ce1811a6" + ] + } + ], + "timestamp": "2026-03-30T15:20:28.206402711", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/deseq2/normalize/main.nf.test b/tests/modules/local/deseq2/normalize/main.nf.test deleted file mode 100644 index 
afd663ed..00000000 --- a/tests/modules/local/deseq2/normalize/main.nf.test +++ /dev/null @@ -1,28 +0,0 @@ -nextflow_process { - - name "Test Process DESEQ2_NORMALIZE" - script "modules/local/deseq2/normalize/main.nf" - process "DESEQ2_NORMALIZE" - tag "deseq2_normalize" - tag "module" - - test("Should run without failures") { - - when { - - process { - """ - meta = [accession: "accession", design: file('$baseDir/tests/input/normalize/design.csv')] - input[0] = [meta, file('$baseDir/tests/input/normalize/all_counts.csv')] - """ - } - } - - then { - assert process.success - assert snapshot(process.out.csv).match() - } - - } - -} diff --git a/tests/modules/local/deseq2/normalize/main.nf.test.snap b/tests/modules/local/deseq2/normalize/main.nf.test.snap deleted file mode 100644 index a80789b2..00000000 --- a/tests/modules/local/deseq2/normalize/main.nf.test.snap +++ /dev/null @@ -1,20 +0,0 @@ -{ - "Should run without failures": { - "content": [ - [ - [ - { - "accession": "accession", - "design": "design.csv:md5,a83dd6a15463b51d94f0a42c196d7933" - }, - "all_counts.log_cpm.csv:md5,4091ece4324283f6edb3b91fafa324fa" - ] - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-28T22:30:31.238971692" - } -} \ No newline at end of file diff --git a/tests/modules/local/edger/normalize/main.nf.test b/tests/modules/local/edger/normalize/main.nf.test deleted file mode 100644 index 27c87d19..00000000 --- a/tests/modules/local/edger/normalize/main.nf.test +++ /dev/null @@ -1,28 +0,0 @@ -nextflow_process { - - name "Test Process EDGER_NORMALIZE" - script "modules/local/edger/normalize/main.nf" - process "EDGER_NORMALIZE" - tag "edger_normalize" - tag "module" - - test("Should run without failures") { - - when { - - process { - """ - meta = [accession: "accession", design: file('$baseDir/tests/input/normalize/design.csv')] - input[0] = [meta, file('$baseDir/tests/input/normalize/all_counts.csv')] - """ - } - } - - then { - assert process.success 
- assert snapshot(process.out).match() - } - - } - -} diff --git a/tests/modules/local/edger/normalize/main.nf.test.snap b/tests/modules/local/edger/normalize/main.nf.test.snap deleted file mode 100644 index fc682f12..00000000 --- a/tests/modules/local/edger/normalize/main.nf.test.snap +++ /dev/null @@ -1,45 +0,0 @@ -{ - "Should run without failures": { - "content": [ - { - "0": [ - [ - { - "accession": "accession", - "design": "design.csv:md5,a83dd6a15463b51d94f0a42c196d7933" - }, - "all_counts.log_cpm.csv:md5,9dd697940642a7e2dbac520eb7a2eeff" - ] - ], - "1": [ - [ - "EDGER_NORMALIZE", - "R", - "4.3.3 (2024-02-29)" - ] - ], - "2": [ - [ - "EDGER_NORMALIZE", - "edgeR", - "4.0.16" - ] - ], - "csv": [ - [ - { - "accession": "accession", - "design": "design.csv:md5,a83dd6a15463b51d94f0a42c196d7933" - }, - "all_counts.log_cpm.csv:md5,9dd697940642a7e2dbac520eb7a2eeff" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:36:28.080031" - } -} \ No newline at end of file diff --git a/tests/modules/local/expressionatlas/getaccessions/main.nf.test b/tests/modules/local/expressionatlas/getaccessions/main.nf.test index 74688073..a5e4f860 100644 --- a/tests/modules/local/expressionatlas/getaccessions/main.nf.test +++ b/tests/modules/local/expressionatlas/getaccessions/main.nf.test @@ -3,33 +3,72 @@ nextflow_process { name "Test Process EXPRESSIONATLAS_GETACCESSIONS" script "modules/local/expressionatlas/getaccessions/main.nf" process "EXPRESSIONATLAS_GETACCESSIONS" - tag "getaccessions" - tag "module" + tag "eatlas_getaccessions" - test('["solanum_tuberosum", ["potato"]]') { + test("Beta vulgaris one keyword - no platform") { - tag "potato_two_kw" + when { + + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = [] + input[3] = 100 + input[4] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris no keyword - rnaseq platform") { when { process { """ - input[0] = 
"solanum_tuberosum" - input[1] = "potato" + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = "rnaseq" + input[3] = 100 + input[4] = 42 """ } } then { assert process.success - assert snapshot(process.out).match() } } - test('["solanum_tuberosum", "potato,phloem"]') { + test("Beta vulgaris - no experiments left after random sampling") { + + when { - tag "potato_two_kw" + process { + """ + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = [] + input[3] = 1 + input[4] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test('Solanum tuberosum two keywords - microarray') { when { @@ -37,20 +76,23 @@ nextflow_process { """ input[0] = "solanum_tuberosum" input[1] = "potato,phloem" + input[2] = "microarray" + input[3] = 10000 + input[4] = 42 """ } } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) } } - test('["solanum_tuberosum", null]') { - - tag "potato_no_kw" + test('Solanum tuberosum no keyword') { when { @@ -58,13 +100,18 @@ nextflow_process { """ input[0] = "solanum_tuberosum" input[1] = "" + input[2] = "microarray" + input[3] = 100 + input[4] = 42 """ } } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) } } diff --git a/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap b/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap index 798b30ce..01391db2 100644 --- a/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap +++ b/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap @@ -1,247 +1,130 @@ { - "[\"solanum_tuberosum\", [null]": { + "Solanum tuberosum two keywords - microarray": { "content": [ { "0": [ - "accessions.csv:md5,dad63ac7a1715277fa44567bc40b5872" + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], "1": [ - [ - 
"EXPRESSIONATLAS_GETACCESSIONS", - "python", - "Python 3.13.0" - ] + "ok" ], "2": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "requests", - "2.32.3" - ] + ], "3": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "nltk", - "3.9.1" - ] + "species_experiments.metadata.tsv:md5,68b329da9893e34099c7d8ad5cb9c940" ], - "csv": [ - "accessions.csv:md5,dad63ac7a1715277fa44567bc40b5872" - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-17T00:26:46.488292622" - }, - "[\"solanum_tuberosum\", [\"potato\", \"phloem\"]]": { - "content": [ - { - "0": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ], - "1": [ + "4": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "python", - "Python 3.13.0" + "3.14.3" ] ], - "2": [ + "5": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "requests", - "2.32.3" + "httpx", + "0.28.1" ] ], - "3": [ + "6": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "nltk", - "3.9.1" + "3.9.2" ] ], - "txt": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-11-26T05:57:18.83875097" - }, - "[\"solanum_tuberosum\", [\"potato\", \"phloem\"]": { - "content": [ - { - "0": [ - "accessions.csv:md5,dad63ac7a1715277fa44567bc40b5872" - ], - "1": [ + "7": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "python", - "Python 3.13.0" + "pyyaml", + "6.0.3" ] ], - "2": [ + "8": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "requests", - "2.32.3" + "pandas", + "3.0.1" ] ], - "3": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "nltk", - "3.9.1" - ] + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], - "csv": [ - "accessions.csv:md5,dad63ac7a1715277fa44567bc40b5872" + "sampling_quota": [ + "ok" ] } ], + "timestamp": "2026-02-19T10:19:07.035607232", "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-17T00:26:09.77988473" + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } }, - "[\"solanum_tuberosum\", \"potato,phloem\"]": { + 
"Solanum tuberosum no keyword": { "content": [ { "0": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], "1": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "python", - "3.13.0" - ] + "ok" ], "2": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "requests", - "2.32.3" - ] + ], "3": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "nltk", - "3.9.1" - ] - ], - "txt": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:37:33.063824" - }, - "[\"solanum_tuberosum\", null]": { - "content": [ - { - "0": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" + "species_experiments.metadata.tsv:md5,68b329da9893e34099c7d8ad5cb9c940" ], - "1": [ + "4": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "python", - "3.13.0" + "3.14.3" ] ], - "2": [ + "5": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "requests", - "2.32.3" + "httpx", + "0.28.1" ] ], - "3": [ + "6": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "nltk", - "3.9.1" - ] - ], - "txt": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:37:53.879594" - }, - "[\"solanum_tuberosum\", [\"potato\"]]": { - "content": [ - { - "0": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ], - "1": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "python", - "3.13.0" + "3.9.2" ] ], - "2": [ + "7": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "requests", - "2.32.3" + "pyyaml", + "6.0.3" ] ], - "3": [ + "8": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "nltk", - "3.9.1" + "pandas", + "3.0.1" ] ], - "txt": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:37:00.096237" - }, - "Should run without failures": { - "content": [ - { - "0": [ - 
"accessions.csv:md5,dad63ac7a1715277fa44567bc40b5872" + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], - "accession": [ - "accessions.csv:md5,dad63ac7a1715277fa44567bc40b5872" + "sampling_quota": [ + "ok" ] } ], + "timestamp": "2026-02-19T10:19:20.628916067", "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-14T18:18:45.473497815" + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/tests/modules/local/expressionatlas/getdata/main.nf.test b/tests/modules/local/expressionatlas/getdata/main.nf.test index 50dc8311..5af0b2a7 100644 --- a/tests/modules/local/expressionatlas/getdata/main.nf.test +++ b/tests/modules/local/expressionatlas/getdata/main.nf.test @@ -3,8 +3,7 @@ nextflow_process { name "Test Process EXPRESSIONATLAS_GETDATA" script "modules/local/expressionatlas/getdata/main.nf" process "EXPRESSIONATLAS_GETDATA" - tag "getdata" - tag "module" + tag "eatlas_getdata" test("Transcriptome Analysis of the potato (rnaseq)") { @@ -20,8 +19,11 @@ nextflow_process { } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) } } @@ -40,8 +42,11 @@ nextflow_process { } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) } } @@ -60,8 +65,136 @@ nextflow_process { } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Invalid accession") { + + tag "getdata_invalid" + + when { + + process { + """ + input[0] = "fake-accession" + """ + } + } + + // must be successful without any output + then { + assertAll( + { assert 
process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("Accession not available") { + + tag "getdata_unavailable" + + when { + + process { + """ + input[0] = "E-GEOD-161565656" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("E-MTAB-5132") { + + tag "getdata_unavailable" + + when { + + process { + """ + input[0] = "E-MTAB-5132" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("E-PROT-138") { + + tag "getdata_unsupported" + + when { + + process { + """ + input[0] = "E-PROT-138" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("E-MTAB-3578 :: serverside error 550") { + + tag "getdata_error_550" + + when { + + process { + """ + input[0] = "E-MTAB-3578" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) } } diff --git a/tests/modules/local/expressionatlas/getdata/main.nf.test.snap 
b/tests/modules/local/expressionatlas/getdata/main.nf.test.snap index 05fb8186..74bcd293 100644 --- a/tests/modules/local/expressionatlas/getdata/main.nf.test.snap +++ b/tests/modules/local/expressionatlas/getdata/main.nf.test.snap @@ -1,289 +1,131 @@ { - "Transcriptome Analysis of the potato": { + "Transcriptome Analysis of the potato (rnaseq)": { "content": [ { "0": [ - "E-MTAB-552.rnaseq.design.csv:md5,142d6bb20d784ba414fa76a6e73bb37f" + "E_MTAB_552_rnaseq.rnaseq.raw.counts.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" ], "1": [ - "E-MTAB-552.rnaseq.raw.csv:md5,315f6ab694cb809bd33f481ad326c5a7" + "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c" ], "2": [ ], "3": [ - [ - "EXPRESSIONATLAS_GETDATA", - "R", - "[1] \"R version 4.3.3 (2024-02-29)\"" - ] - ], - "4": [ - [ - "EXPRESSIONATLAS_GETDATA", - "ExpressionAtlas", - "[1] \u20181.30.0\u2019" - ] - ], - "metadata": [ - "E-MTAB-552.rnaseq.design.csv:md5,142d6bb20d784ba414fa76a6e73bb37f" - ], - "normalized": [ - - ], - "raw": [ - "E-MTAB-552.rnaseq.raw.csv:md5,315f6ab694cb809bd33f481ad326c5a7" - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-17T13:31:14.414421877" - }, - "arabidopsis issue": { - "content": [ - { - "0": [ ], - "1": [ - [ - "E-GEOD-62537", - "E_GEOD_62537_A_AFFY_2.normalized.csv:md5,673c55171d0ccfc1d036bf43c49ae320" - ] - ], - "2": [ - "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" - ], - "3": [ + "4": [ [ "EXPRESSIONATLAS_GETDATA", "R", - "[1] \"R version 4.3.3 (2024-02-29)\"" + "4.4.3 (2025-02-28)" ] ], - "4": [ + "5": [ [ "EXPRESSIONATLAS_GETDATA", "ExpressionAtlas", - "[1] \u20181.30.0\u2019" + "1.34.0" ] ], - "design": [ - "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" - ], - "normalized": [ - [ - "E-GEOD-62537", - "E_GEOD_62537_A_AFFY_2.normalized.csv:md5,673c55171d0ccfc1d036bf43c49ae320" - ] + "counts": [ + 
"E_MTAB_552_rnaseq.rnaseq.raw.counts.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" ], - "raw": [ - + "design": [ + "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c" ] } ], + "timestamp": "2026-03-19T12:17:31.898448037", "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-11-03T19:19:10.505238238" + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } }, - "Transcriptome Analysis of the potato (rnaseq)": { + "Arabidopsis Geo dataset": { "content": [ { "0": [ - [ - "E-MTAB-552", - "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c", - "E_MTAB_552_rnaseq.raw.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" - ] + "E_GEOD_62537_A_AFFY_2.microarray.normalised.counts.csv:md5,673c55171d0ccfc1d036bf43c49ae320" ], "1": [ - + "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" ], "2": [ - [ - "EXPRESSIONATLAS_GETDATA", - "R", - "4.3.3 (2024-02-29)" - ] - ], - "3": [ - [ - "EXPRESSIONATLAS_GETDATA", - "ExpressionAtlas", - "1.30.0" - ] - ], - "normalized": [ ], - "raw": [ - [ - "E-MTAB-552", - "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c", - "E_MTAB_552_rnaseq.raw.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:38:14.715423" - }, - "Arabidopsis Geo dataset": { - "content": [ - { - "0": [ + "3": [ ], - "1": [ - [ - "E-GEOD-62537", - "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88", - "E_GEOD_62537_A_AFFY_2.normalized.csv:md5,673c55171d0ccfc1d036bf43c49ae320" - ] - ], - "2": [ + "4": [ [ "EXPRESSIONATLAS_GETDATA", "R", - "4.3.3 (2024-02-29)" + "4.4.3 (2025-02-28)" ] ], - "3": [ + "5": [ [ "EXPRESSIONATLAS_GETDATA", "ExpressionAtlas", - "1.30.0" + "1.34.0" ] ], - "normalized": [ - [ - "E-GEOD-62537", - "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88", - "E_GEOD_62537_A_AFFY_2.normalized.csv:md5,673c55171d0ccfc1d036bf43c49ae320" - ] + "counts": [ 
+ "E_GEOD_62537_A_AFFY_2.microarray.normalised.counts.csv:md5,673c55171d0ccfc1d036bf43c49ae320" ], - "raw": [ - + "design": [ + "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" ] } ], + "timestamp": "2026-03-29T16:45:22.368557567", "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:38:56.536948" + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } }, - "Transcription profiling by array of Arabidopsis mutant for fis2": { + "Transcription profiling by array of Arabidopsis mutant for fis2 (microarray)": { "content": [ { "0": [ - "E-TABM-1007.A-AFFY-2.design.csv:md5,cb094d8a7f92d660a9d88d6eb6ec704a" + "E_TABM_1007_A_AFFY_2.microarray.normalised.counts.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" ], "1": [ - + "E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1" ], "2": [ - "E-TABM-1007.A-AFFY-2.normalized.csv:md5,53672c5a95c252dadcf9ba718bdf2866" - ], - "3": [ - [ - "EXPRESSIONATLAS_GETDATA", - "R", - "[1] \"R version 4.3.3 (2024-02-29)\"" - ] - ], - "4": [ - [ - "EXPRESSIONATLAS_GETDATA", - "ExpressionAtlas", - "[1] \u20181.30.0\u2019" - ] - ], - "metadata": [ - "E-TABM-1007.A-AFFY-2.design.csv:md5,cb094d8a7f92d660a9d88d6eb6ec704a" - ], - "normalized": [ - "E-TABM-1007.A-AFFY-2.normalized.csv:md5,53672c5a95c252dadcf9ba718bdf2866" - ], - "raw": [ - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-17T13:28:23.604650949" - }, - "Should run without failures": { - "content": [ - { - "0": [ - "E-MTAB-552_rnaseq.csv:md5,315f6ab694cb809bd33f481ad326c5a7" ], - "csv": [ - "E-MTAB-552_rnaseq.csv:md5,315f6ab694cb809bd33f481ad326c5a7" - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-14T18:28:01.099092539" - }, - "Transcription profiling by array of Arabidopsis mutant for fis2 (microarray)": { - "content": [ - { - "0": [ + "3": [ ], - "1": [ - [ - "E-TABM-1007", - 
"E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1", - "E_TABM_1007_A_AFFY_2.normalized.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" - ] - ], - "2": [ + "4": [ [ "EXPRESSIONATLAS_GETDATA", "R", - "4.3.3 (2024-02-29)" + "4.4.3 (2025-02-28)" ] ], - "3": [ + "5": [ [ "EXPRESSIONATLAS_GETDATA", "ExpressionAtlas", - "1.30.0" + "1.34.0" ] ], - "normalized": [ - [ - "E-TABM-1007", - "E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1", - "E_TABM_1007_A_AFFY_2.normalized.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" - ] + "counts": [ + "E_TABM_1007_A_AFFY_2.microarray.normalised.counts.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" ], - "raw": [ - + "design": [ + "E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1" ] } ], + "timestamp": "2026-03-19T12:17:45.546042421", "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:38:35.737187" + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/tests/modules/local/filter_and_rename_genes/main.nf.test b/tests/modules/local/filter_and_rename_genes/main.nf.test new file mode 100644 index 00000000..a48b2fba --- /dev/null +++ b/tests/modules/local/filter_and_rename_genes/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + + name "Test Process FILTER_AND_RENAME_GENES" + script "modules/local/filter_and_rename_genes/main.nf" + process "FILTER_AND_RENAME_GENES" + tag "filter_and_rename_genes" + + test("Map Ensembl IDs") { + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) + ] + ) + input[1] = file("$projectDir/tests/test_data/idmapping/mapped/mapped_gene_ids.csv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/mapped/valid_gene_ids.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + + test("No valid gene") { + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) + ] + ) + input[1] = file("$projectDir/tests/test_data/idmapping/mapped/mapped_gene_ids.csv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/mapped/no_valid_gene_id.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Custom mapping - TSV") { + + tag "custom_mapping_tsv" + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv", checkIfExists: true) + ] + ) + input[1] = file( "$projectDir/tests/test_data/idmapping/tsv/mapping.tsv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/tsv/valid_gene_ids.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/filter_and_rename_genes/main.nf.test.snap b/tests/modules/local/filter_and_rename_genes/main.nf.test.snap new file mode 100644 index 00000000..5c57a3d7 --- /dev/null +++ b/tests/modules/local/filter_and_rename_genes/main.nf.test.snap @@ -0,0 +1,156 @@ +{ + "Custom mapping - TSV": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + "test", + "failure_reason.txt:md5,0eea8256c81d0362f3f10979ab2de23e" + ] + ], + "2": [ + + ], + "3": [ + [ + "test", + "0", + "0", + "3", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + + ] + } + ], + "timestamp": "2026-04-02T15:03:17.972937783", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "Map 
Ensembl IDs": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.ensembl_ids.renamed.parquet:md5,1fe83a8ee993d02c9df18f7412d20f0f" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "test", + "2", + "1", + "1", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.ensembl_ids.renamed.parquet:md5,1fe83a8ee993d02c9df18f7412d20f0f" + ] + ] + } + ], + "timestamp": "2026-04-02T15:03:06.767979138", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "No valid gene": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + "test", + "failure_reason.txt:md5,0eea8256c81d0362f3f10979ab2de23e" + ] + ], + "2": [ + + ], + "3": [ + [ + "test", + "0", + "0", + "3", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + + ] + } + ], + "timestamp": "2026-04-02T15:03:12.057594832", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/compute_m_measure/main.nf.test b/tests/modules/local/genorm/compute_m_measure/main.nf.test new file mode 100644 index 00000000..fbcd556c --- /dev/null +++ b/tests/modules/local/genorm/compute_m_measure/main.nf.test @@ -0,0 +1,29 @@ +nextflow_process { + + name "Test Process COMPUTE_M_MEASURE" + script "modules/local/genorm/compute_m_measure/main.nf" + process "COMPUTE_M_MEASURE" + tag "m_measure" + + test("Four initial chunk files") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/genorm/compute_m_measure/input/std.*.parquet', checkIfExists: true).collect() + ] + """ + } + } + + then { + 
assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap b/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap new file mode 100644 index 00000000..f8ea8893 --- /dev/null +++ b/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "Four initial chunk files": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ], + "1": [ + [ + "COMPUTE_M_MEASURE", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_M_MEASURE", + "polars", + "1.39.2" + ] + ], + "m_measures": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ] + } + ], + "timestamp": "2026-03-30T15:40:23.09370734", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/cross_join/main.nf.test b/tests/modules/local/genorm/cross_join/main.nf.test new file mode 100644 index 00000000..3e4d4f7a --- /dev/null +++ b/tests/modules/local/genorm/cross_join/main.nf.test @@ -0,0 +1,29 @@ +nextflow_process { + + name "Test Process CROSS_JOIN" + script "modules/local/genorm/cross_join/main.nf" + process "CROSS_JOIN" + tag "cross_join" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1", index_1: 0, index_2: 1], + file( '$projectDir/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/cross_join/main.nf.test.snap b/tests/modules/local/genorm/cross_join/main.nf.test.snap new file mode 100644 index 00000000..b11d3254 --- /dev/null +++ 
b/tests/modules/local/genorm/cross_join/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "cross_join.0.1.parquet:md5,10d5591947a85f788dd6db61a1486f14" + ] + ], + "1": [ + [ + "CROSS_JOIN", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "CROSS_JOIN", + "polars", + "1.39.2" + ] + ], + "data": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "cross_join.0.1.parquet:md5,10d5591947a85f788dd6db61a1486f14" + ] + ] + } + ], + "timestamp": "2026-03-30T15:40:29.248178717", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/expression_ratio/main.nf.test b/tests/modules/local/genorm/expression_ratio/main.nf.test new file mode 100644 index 00000000..9b355b77 --- /dev/null +++ b/tests/modules/local/genorm/expression_ratio/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process EXPRESSION_RATIO" + script "modules/local/genorm/expression_ratio/main.nf" + process "EXPRESSION_RATIO" + tag "expression_ratio" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1", index_1: 0, index_2: 1], + file( '$projectDir/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/expression_ratio/main.nf.test.snap b/tests/modules/local/genorm/expression_ratio/main.nf.test.snap new file mode 100644 index 00000000..f0347aba --- /dev/null +++ b/tests/modules/local/genorm/expression_ratio/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "ratios.0.1.parquet:md5,dd929c967bc78a650c33eb0885544f50" + ] + ], + 
"1": [ + [ + "EXPRESSION_RATIO", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "EXPRESSION_RATIO", + "polars", + "1.39.2" + ] + ], + "data": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "ratios.0.1.parquet:md5,dd929c967bc78a650c33eb0885544f50" + ] + ] + } + ], + "timestamp": "2026-04-01T09:41:39.459415462", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/make_chunks/main.nf.test b/tests/modules/local/genorm/make_chunks/main.nf.test new file mode 100644 index 00000000..3a9c230f --- /dev/null +++ b/tests/modules/local/genorm/make_chunks/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process MAKE_CHUNKS" + script "modules/local/genorm/make_chunks/main.nf" + process "MAKE_CHUNKS" + tag "make_chunks" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/make_chunks/main.nf.test.snap b/tests/modules/local/genorm/make_chunks/main.nf.test.snap new file mode 100644 index 00000000..6cb9d08f --- /dev/null +++ b/tests/modules/local/genorm/make_chunks/main.nf.test.snap @@ -0,0 +1,65 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + [ + "count_chunk.0.parquet:md5,2b49edb51f57065edec0dbbc3b50cd03", + "count_chunk.1.parquet:md5,a229839cc11b60b51d75e69bda1b079e", + "count_chunk.2.parquet:md5,79e06a8d5438a1fd8c35bb7e861bbb2f", + "count_chunk.3.parquet:md5,b4b75fd8c257684914ea81acec63c7b2", + "count_chunk.4.parquet:md5,938d6eb757a2114fba7c37cb79917fdb", + "count_chunk.5.parquet:md5,7de0a7158eaf28de2728ad10ed68fea3", + "count_chunk.6.parquet:md5,b7bb9a8ed8578bbf661d60dc0cc43a09", + 
"count_chunk.7.parquet:md5,d424e46fbcab660f7994086d95d83955", + "count_chunk.8.parquet:md5,5411ffdabeda55de3d67ae8cc32e0276", + "count_chunk.9.parquet:md5,484ecc44837b0a0f3098bff5a8144853" + ] + ] + ], + "1": [ + [ + "MAKE_CHUNKS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "MAKE_CHUNKS", + "polars", + "1.39.2" + ] + ], + "chunks": [ + [ + { + "section": "section_1" + }, + [ + "count_chunk.0.parquet:md5,2b49edb51f57065edec0dbbc3b50cd03", + "count_chunk.1.parquet:md5,a229839cc11b60b51d75e69bda1b079e", + "count_chunk.2.parquet:md5,79e06a8d5438a1fd8c35bb7e861bbb2f", + "count_chunk.3.parquet:md5,b4b75fd8c257684914ea81acec63c7b2", + "count_chunk.4.parquet:md5,938d6eb757a2114fba7c37cb79917fdb", + "count_chunk.5.parquet:md5,7de0a7158eaf28de2728ad10ed68fea3", + "count_chunk.6.parquet:md5,b7bb9a8ed8578bbf661d60dc0cc43a09", + "count_chunk.7.parquet:md5,d424e46fbcab660f7994086d95d83955", + "count_chunk.8.parquet:md5,5411ffdabeda55de3d67ae8cc32e0276", + "count_chunk.9.parquet:md5,484ecc44837b0a0f3098bff5a8144853" + ] + ] + ] + } + ], + "timestamp": "2026-03-30T15:40:46.563584649", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/ratio_standard_variation/main.nf.test b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test new file mode 100644 index 00000000..d235f36e --- /dev/null +++ b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process RATIO_STANDARD_VARIATION" + script "modules/local/genorm/ratio_standard_variation/main.nf" + process "RATIO_STANDARD_VARIATION" + tag "ratio_std" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1", index_1: 0, index_2: 1], + file( '$projectDir/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert 
snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap new file mode 100644 index 00000000..a16d5709 --- /dev/null +++ b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "std.0.1.parquet:md5,10e262fc1dff8efe522a2efcee6ccb87" + ] + ], + "1": [ + [ + "RATIO_STANDARD_VARIATION", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "RATIO_STANDARD_VARIATION", + "polars", + "1.39.2" + ] + ], + "data": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "std.0.1.parquet:md5,10e262fc1dff8efe522a2efcee6ccb87" + ] + ] + } + ], + "timestamp": "2026-04-01T09:41:51.590963847", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/geo/getaccessions/main.nf.test b/tests/modules/local/geo/getaccessions/main.nf.test new file mode 100644 index 00000000..deddb432 --- /dev/null +++ b/tests/modules/local/geo/getaccessions/main.nf.test @@ -0,0 +1,71 @@ +nextflow_process { + + name "Test Process GEO_GETACCESSIONS" + script "modules/local/geo/getaccessions/main.nf" + process "GEO_GETACCESSIONS" + tag "geo_getaccession" + + test("Beta vulgaris - exclude two accessions") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = [] + input[3] = file( '$projectDir/tests/test_data/public_accessions/exclude_one_geo_accession.txt', checkIfExists: true ) + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris - leaf / microarray") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = "microarray" + input[3] = [] + input[4] = 100 + input[5] = 42 + """ + } + } + + 
then { + assert process.success + } + + } + + test("Beta vulgaris - leaf / microarray - repeated run") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = "microarray" + input[3] = [] + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + +} diff --git a/tests/modules/local/geo/getaccessions/main.nf.test.snap b/tests/modules/local/geo/getaccessions/main.nf.test.snap new file mode 100644 index 00000000..17bfedde --- /dev/null +++ b/tests/modules/local/geo/getaccessions/main.nf.test.snap @@ -0,0 +1,87 @@ +{ + "Beta vulgaris": { + "content": [ + { + "0": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "1": [ + [ + "filtered_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "rejected_datasets.metadata.tsv:md5,b7382bbefa84d5bb60089b057e75c09b", + "selected_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "species_datasets.metadata.tsv:md5,c36fd625541112de75d4d4ab38ec68e5" + ] + ], + "2": [ + "selected_datasets.keywords.yaml:md5,f7726c8e3b07ed20e5572d79fb7f575e" + ], + "3": [ + [ + "GEO_GETACCESSIONS", + "python", + "3.13.7" + ] + ], + "4": [ + [ + "GEO_GETACCESSIONS", + "requests", + "2.32.5" + ] + ], + "5": [ + [ + "GEO_GETACCESSIONS", + "nltk", + "3.9.1" + ] + ], + "6": [ + [ + "GEO_GETACCESSIONS", + "pyyaml", + "6.0.2" + ] + ], + "7": [ + [ + "GEO_GETACCESSIONS", + "pandas", + "2.3.2" + ] + ], + "8": [ + [ + "GEO_GETACCESSIONS", + "xmltodict", + "0.14.2" + ] + ], + "9": [ + [ + "GEO_GETACCESSIONS", + "biopython", + "1.85" + ] + ], + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "metadata": [ + [ + "filtered_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "rejected_datasets.metadata.tsv:md5,b7382bbefa84d5bb60089b057e75c09b", + "selected_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "species_datasets.metadata.tsv:md5,c36fd625541112de75d4d4ab38ec68e5" + ] + ] + } + ], + "meta": { + "nf-test": 
"0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-10-18T11:17:44.966423003" + } +} \ No newline at end of file diff --git a/tests/modules/local/geo/getdata/main.nf.test b/tests/modules/local/geo/getdata/main.nf.test new file mode 100644 index 00000000..d71dbf3b --- /dev/null +++ b/tests/modules/local/geo/getdata/main.nf.test @@ -0,0 +1,237 @@ +nextflow_process { + + name "Test Process GEO_GETDATA" + script "modules/local/geo/getdata/main.nf" + process "GEO_GETDATA" + tag "geo_getdata" + + /* + // TODO: see why these tests give ".command.run: No such file or directory" errors sometimes, even when running locally with act + // since this process is experimental, we can skip it for now + test("Beta vulgaris - Small RNA of sugar beet in response to drought stress") { + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE205328" + ] + input[1] = "beta vulgaris" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + test("Accession does not exist") { + + when { + + process { + """ + input[0] = [ + [ ], + "GSE568945478" + ] + input[1] = "blabla" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Only one sample among several") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE59707" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - No data found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE124142" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert 
snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by array") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE43665" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by high throughput sequencing / Some raw counts found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE59707" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by high throughput sequencing / One raw count found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE100837" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Only series suppl data but multiple species") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE274048" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Mismatch in suppl data colnames / design") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE49127" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + */ + +} diff --git 
a/tests/modules/local/geo/getdata/main.nf.test.snap b/tests/modules/local/geo/getdata/main.nf.test.snap new file mode 100644 index 00000000..3eab3ef8 --- /dev/null +++ b/tests/modules/local/geo/getdata/main.nf.test.snap @@ -0,0 +1,506 @@ +{ + "Drosophila simulans - Expression profiling by high throughput sequencing / One raw count found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:35.640938644", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - No data found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:07.057142353", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Mismatch in suppl data colnames / design": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:54.838651132", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Accession does not exist": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + 
"3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:53:48.164404869", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Expression profiling by array": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:16.400915284", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Expression profiling by high throughput sequencing / Some raw counts found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:25.947789471", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Only series suppl data but multiple species": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": 
"2026-03-30T14:54:45.272163295", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Only one sample among several": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:53:57.533758257", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Beta vulgaris - Small RNA of sugar beet in response to drought stress": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:53:38.690525862", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/get_candidate_genes/main.nf.test b/tests/modules/local/get_candidate_genes/main.nf.test new file mode 100644 index 00000000..992a5635 --- /dev/null +++ b/tests/modules/local/get_candidate_genes/main.nf.test @@ -0,0 +1,54 @@ +nextflow_process { + + name "Test Process GET_CANDIDATE_GENES" + script "modules/local/get_candidate_genes/main.nf" + process "GET_CANDIDATE_GENES" + tag "get_candidate_genes" + + test("Nb sections & candidates per section lower than total nb genes") { + + when { + + process { + """ + input[0] = file('$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file('$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', 
checkIfExists: true) + input[2] = "2" + input[3] = 3 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Too many sections") { + + when { + + process { + """ + input[0] = file('$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file('$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', checkIfExists: true) + input[2] = "50" + input[3] = 20 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/get_candidate_genes/main.nf.test.snap b/tests/modules/local/get_candidate_genes/main.nf.test.snap new file mode 100644 index 00000000..c9e2ede4 --- /dev/null +++ b/tests/modules/local/get_candidate_genes/main.nf.test.snap @@ -0,0 +1,132 @@ +{ + "Nb sections & candidates per section lower than total nb genes": { + "content": [ + { + "0": [ + [ + "section_1.candidate_counts.parquet:md5,7d1a1996214fb07741f2ad5c286fbc69", + "section_2.candidate_counts.parquet:md5,9e2de0ea75c3f839e38690f3d9a57b0b", + "section_3.candidate_counts.parquet:md5,860cdeb5dbfe7d24b2c12635ea85c10e" + ] + ], + "1": [ + [ + "section_1.stats.parquet:md5,3414fd57e9bf4f221b2df93be2e890a2", + "section_2.stats.parquet:md5,99b7bcc7944c77eb569b688c640d70f2", + "section_3.stats.parquet:md5,752dbfb5699fbe6847ac17a4fb6da51a" + ] + ], + "2": [ + [ + "GET_CANDIDATE_GENES", + "python", + "3.14.3" + ] + ], + "3": [ + [ + "GET_CANDIDATE_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + "section_1.candidate_counts.parquet:md5,7d1a1996214fb07741f2ad5c286fbc69", + "section_2.candidate_counts.parquet:md5,9e2de0ea75c3f839e38690f3d9a57b0b", + "section_3.candidate_counts.parquet:md5,860cdeb5dbfe7d24b2c12635ea85c10e" + ] + ], + "section_stats": [ + [ + "section_1.stats.parquet:md5,3414fd57e9bf4f221b2df93be2e890a2", + 
"section_2.stats.parquet:md5,99b7bcc7944c77eb569b688c640d70f2", + "section_3.stats.parquet:md5,752dbfb5699fbe6847ac17a4fb6da51a" + ] + ] + } + ], + "timestamp": "2026-03-30T17:07:03.292271274", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Too many sections": { + "content": [ + { + "0": [ + [ + "section_12.candidate_counts.parquet:md5,df8b5a6629b2b84b4c73156bbf261a92", + "section_14.candidate_counts.parquet:md5,1ef1a4e6cef6dd6b04ca28a505985bb5", + "section_16.candidate_counts.parquet:md5,f246e9291457873463ba3bff49e07b9d", + "section_18.candidate_counts.parquet:md5,f67b1be510bd1b41f58088ae08cc494c", + "section_20.candidate_counts.parquet:md5,3349d94fa3b42a917704c7abc2d807f9", + "section_3.candidate_counts.parquet:md5,2ddf5f2e7cca3e8df9930520b3131495", + "section_5.candidate_counts.parquet:md5,179d419cc48a36a991fbe74e4dcb28fa", + "section_7.candidate_counts.parquet:md5,41f08e512f5d44eff8fa0ce3d49ac0f4", + "section_9.candidate_counts.parquet:md5,85c942fa7ec2f2b4e5af6149de007328" + ] + ], + "1": [ + [ + "section_12.stats.parquet:md5,069d23175be9d1a733b4996895d4a3ce", + "section_14.stats.parquet:md5,73e852a1083f86d2d99d4f93ab6228c0", + "section_16.stats.parquet:md5,6aa616308faf4403194f18fae7cd1024", + "section_18.stats.parquet:md5,0b17005fab7582663111ea77cefca427", + "section_20.stats.parquet:md5,03c52c419ff94b63d0b16b2e9e87fa26", + "section_3.stats.parquet:md5,14a121ecab4116935fa9df136afc997a", + "section_5.stats.parquet:md5,3a3700641343056feabac2aa76626556", + "section_7.stats.parquet:md5,d6af8c940d55e449397b7fc0c428fedf", + "section_9.stats.parquet:md5,eb375761c7111b78bf8779bf71f876ef" + ] + ], + "2": [ + [ + "GET_CANDIDATE_GENES", + "python", + "3.14.3" + ] + ], + "3": [ + [ + "GET_CANDIDATE_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + "section_12.candidate_counts.parquet:md5,df8b5a6629b2b84b4c73156bbf261a92", + "section_14.candidate_counts.parquet:md5,1ef1a4e6cef6dd6b04ca28a505985bb5", + 
"section_16.candidate_counts.parquet:md5,f246e9291457873463ba3bff49e07b9d", + "section_18.candidate_counts.parquet:md5,f67b1be510bd1b41f58088ae08cc494c", + "section_20.candidate_counts.parquet:md5,3349d94fa3b42a917704c7abc2d807f9", + "section_3.candidate_counts.parquet:md5,2ddf5f2e7cca3e8df9930520b3131495", + "section_5.candidate_counts.parquet:md5,179d419cc48a36a991fbe74e4dcb28fa", + "section_7.candidate_counts.parquet:md5,41f08e512f5d44eff8fa0ce3d49ac0f4", + "section_9.candidate_counts.parquet:md5,85c942fa7ec2f2b4e5af6149de007328" + ] + ], + "section_stats": [ + [ + "section_12.stats.parquet:md5,069d23175be9d1a733b4996895d4a3ce", + "section_14.stats.parquet:md5,73e852a1083f86d2d99d4f93ab6228c0", + "section_16.stats.parquet:md5,6aa616308faf4403194f18fae7cd1024", + "section_18.stats.parquet:md5,0b17005fab7582663111ea77cefca427", + "section_20.stats.parquet:md5,03c52c419ff94b63d0b16b2e9e87fa26", + "section_3.stats.parquet:md5,14a121ecab4116935fa9df136afc997a", + "section_5.stats.parquet:md5,3a3700641343056feabac2aa76626556", + "section_7.stats.parquet:md5,d6af8c940d55e449397b7fc0c428fedf", + "section_9.stats.parquet:md5,eb375761c7111b78bf8779bf71f876ef" + ] + ] + } + ], + "timestamp": "2026-03-30T17:07:09.643611957", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/gprofiler/idmapping/main.nf.test b/tests/modules/local/gprofiler/idmapping/main.nf.test index 0a06b9ff..e754ba0f 100644 --- a/tests/modules/local/gprofiler/idmapping/main.nf.test +++ b/tests/modules/local/gprofiler/idmapping/main.nf.test @@ -3,65 +3,49 @@ nextflow_process { name "Test Process GPROFILER_IDMAPPING" script "modules/local/gprofiler/idmapping/main.nf" process "GPROFILER_IDMAPPING" - tag "idmapping" - tag "module" + tag "gprofiler_idmapping" - test("Map Ensembl IDs to themselves") { + test("ENSG - Mapping found") { when { - process { - """ - meta = [] - count_file = 
file("$baseDir/tests/input/idmapping/counts.ensembl_ids.csv") - input[0] = [meta, count_file, "Solanum tuberosum"] - """ - } - } - then { - assert process.success - assert snapshot(process.out.csv).match() - } - - } - - test("Map NCBI IDs") { - - when { process { """ - meta = [] - count_file = file("$baseDir/tests/input/idmapping/counts.ncbi_ids.csv") - input[0] = [meta, count_file, "Arabidopsis thaliana"] + input[0] = file("$projectDir/tests/test_data/idmapping/gene_ids/gene_ids.txt", checkIfExists: true) + input[1] = "Solanum tuberosum" + input[2] = "ENSG" """ } } then { - assert process.success - assert snapshot(process.out.csv).match() + assertAll( + { assert process.success }, + { assert process.out.mapping.size() == 1 }, + { assert snapshot(process.out).match() } + ) } - } - - - test("Map Uniprot IDs") { + /* + test("Entrez - No mapping found") { when { + process { """ - meta = [] - count_file = file("$baseDir/tests/input/idmapping/counts.uniprot_ids.csv") - input[0] = [meta, count_file, "Arabidopsis thaliana"] + input[0] = file("$projectDir/tests/test_data/idmapping/gene_ids/gene_ids.txt", checkIfExists: true) + input[1] = "Solanum tuberosum" + input[2] = "ENTREZGENE" """ } } then { - assert process.success - assert snapshot(process.out.csv).match() + assertAll( + { assert !process.success } + ) } - } + */ } diff --git a/tests/modules/local/gprofiler/idmapping/main.nf.test.snap b/tests/modules/local/gprofiler/idmapping/main.nf.test.snap index fc4c98ea..33550d23 100644 --- a/tests/modules/local/gprofiler/idmapping/main.nf.test.snap +++ b/tests/modules/local/gprofiler/idmapping/main.nf.test.snap @@ -1,38 +1,43 @@ { - "Map Ensembl IDs to themselves": { + "ENSG - Mapping found": { "content": [ - [ - "counts.ensembl_ids_renamed.csv:md5,7fce29e696d35cf612c8c8c06b77fd56" - ] + { + "0": [ + "mapped_gene_ids.csv:md5,c4ef4df6530509b486662a107ba8de44" + ], + "1": [ + "gene_metadata.csv:md5,f4dad0185e6f2d780f561d3efc301562" + ], + "2": [ + [ + "GPROFILER_IDMAPPING", + 
"python", + "3.14.3" + ] + ], + "3": [ + [ + "GPROFILER_IDMAPPING", + "pandas", + "3.0.1" + ] + ], + "4": [ + [ + "GPROFILER_IDMAPPING", + "httpx", + "0.28.1" + ] + ], + "metadata": [ + "gene_metadata.csv:md5,f4dad0185e6f2d780f561d3efc301562" + ] + } ], + "timestamp": "2026-02-19T10:26:01.249646558", "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-20T23:29:07.259617578" - }, - "Map Uniprot IDs": { - "content": [ - [ - "counts.uniprot_ids_renamed.csv:md5,6eceb6063772b12fe48fa81e2418424c" - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-20T23:37:08.508538995" - }, - "Map NCBI IDs": { - "content": [ - [ - "counts.ncbi_ids_renamed.csv:md5,6eceb6063772b12fe48fa81e2418424c" - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-20T23:36:51.904926841" + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/tests/modules/local/merge_counts/main.nf.test b/tests/modules/local/merge_counts/main.nf.test new file mode 100644 index 00000000..89651847 --- /dev/null +++ b/tests/modules/local/merge_counts/main.nf.test @@ -0,0 +1,85 @@ +nextflow_process { + + name "Test Process MERGE_COUNTS" + script "modules/local/merge_counts/main.nf" + process "MERGE_COUNTS" + tag "merge_counts" + + test("3 files") { + + when { + + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + [ + file("$projectDir/tests/test_data/merge_data/input/counts1.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/merge_data/input/counts2.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/merge_data/input/counts3.parquet", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + /* + test("2 identical files") { + + when { + + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + [ + 
file("$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + */ + test("1 file") { + + when { + + process { + """ + input[0] = [ + [ platform: 'microarray' ], + [ + file("$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/tests/modules/local/merge_counts/main.nf.test.snap b/tests/modules/local/merge_counts/main.nf.test.snap new file mode 100644 index 00000000..7290c51d --- /dev/null +++ b/tests/modules/local/merge_counts/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "1 file": { + "content": [ + { + "0": [ + [ + { + "platform": "microarray" + }, + "all_counts.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ], + "1": [ + [ + "MERGE_COUNTS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "MERGE_COUNTS", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "platform": "microarray" + }, + "all_counts.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ] + } + ], + "timestamp": "2026-03-30T16:41:25.646239587", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "3 files": { + "content": [ + { + "0": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,c519c7936217c9399081069a48539c07" + ] + ], + "1": [ + [ + "MERGE_COUNTS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "MERGE_COUNTS", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,c519c7936217c9399081069a48539c07" + ] + ] + } + ], + "timestamp": "2026-03-30T16:39:46.447995126", + "meta": { + "nf-test": "0.9.3", + "nextflow": 
"25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/normalisation/compute_cpm/main.nf.test b/tests/modules/local/normalisation/compute_cpm/main.nf.test new file mode 100644 index 00000000..c32312b8 --- /dev/null +++ b/tests/modules/local/normalisation/compute_cpm/main.nf.test @@ -0,0 +1,101 @@ +nextflow_process { + + name "Test Process NORMALISATION_COMPUTE_CPM" + script "modules/local/normalisation/compute_cpm/main.nf" + process "NORMALISATION_COMPUTE_CPM" + tag "cpm_norm" + + + test("Very small dataset") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Rows with many zeros") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test"], + file('$projectDir/tests/test_data/normalisation/many_zeros/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One group") { + + when { + + process { + """ + input[0] = [ + [ dataset: "accession" ], + file('$projectDir/tests/test_data/normalisation/one_group/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("TSV files") { + + when { + + process { + """ + input[0] = [ + [ dataset: "accession" ], + file('$projectDir/tests/test_data/normalisation/base/counts.tsv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap b/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap new file mode 100644 index 00000000..5bcd2643 --- /dev/null +++ b/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap @@ -0,0 
+1,190 @@ +{ + "Very small dataset": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ] + } + ], + "timestamp": "2026-04-02T15:06:33.626810331", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "One group": { + "content": [ + { + "0": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,c8855975f68aad3c3bb060a23c14e2f9" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,c8855975f68aad3c3bb060a23c14e2f9" + ] + ] + } + ], + "timestamp": "2026-03-19T12:23:45.874853063", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "TSV files": { + "content": [ + { + "0": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ] + } + ], + "timestamp": "2026-03-19T12:23:54.407797312", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Rows with many zeros": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,ab2596a5bb8b3b2e39754191a2dce2aa" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + 
"3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,ab2596a5bb8b3b2e39754191a2dce2aa" + ] + ] + } + ], + "timestamp": "2026-04-02T15:06:39.394957809", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/normalisation/compute_tpm/main.nf.test b/tests/modules/local/normalisation/compute_tpm/main.nf.test new file mode 100644 index 00000000..23463223 --- /dev/null +++ b/tests/modules/local/normalisation/compute_tpm/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process NORMALISATION_COMPUTE_TPM" + script "modules/local/normalisation/compute_tpm/main.nf" + process "NORMALISATION_COMPUTE_TPM" + tag "tpm_norm" + + + test("Very small dataset") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/base/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Rows with many zeros") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test"], + file('$projectDir/tests/test_data/normalisation/many_zeros/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/many_zeros/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One group") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/one_group/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/one_group/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + 
test("TSV files") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.tsv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/base/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + +} diff --git a/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap b/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap new file mode 100644 index 00000000..f150a846 --- /dev/null +++ b/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap @@ -0,0 +1,190 @@ +{ + "Very small dataset": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ] + } + ], + "timestamp": "2026-04-02T15:06:57.587153369", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "One group": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,2bb5797b24bcd02a06b2794c94567638" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,2bb5797b24bcd02a06b2794c94567638" + ] + ] + } + ], + "timestamp": "2026-04-02T15:07:10.080660208", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "TSV files": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ], + "1": [ + + ], + "2": [ + + ], 
+ "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ] + } + ], + "timestamp": "2026-04-02T15:07:16.231969197", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "Rows with many zeros": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,95563b1ba1083cfc31c2b9c18c5aeaaa" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,95563b1ba1083cfc31c2b9c18c5aeaaa" + ] + ] + } + ], + "timestamp": "2026-04-02T15:07:03.89821637", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/normfinder/main.nf.test b/tests/modules/local/normfinder/main.nf.test new file mode 100644 index 00000000..105de0d3 --- /dev/null +++ b/tests/modules/local/normfinder/main.nf.test @@ -0,0 +1,50 @@ +nextflow_process { + + name "Test Process NORMFINDER" + script "modules/local/normfinder/main.nf" + process "NORMFINDER" + tag "normfinder" + + test("Very small dataset - Cq values") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet', checkIfExists: true) + ] + input[1] = file( '$projectDir/tests/test_data/normfinder/very_small_cq/design.csv', checkIfExists: true) + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + + test("Small dataset - Real expression values") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( 
'$projectDir/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet', checkIfExists: true) + ] + input[1] = file( '$projectDir/tests/test_data/normfinder/small_normalised/design.csv', checkIfExists: true) + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/normfinder/main.nf.test.snap b/tests/modules/local/normfinder/main.nf.test.snap new file mode 100644 index 00000000..4e90417c --- /dev/null +++ b/tests/modules/local/normfinder/main.nf.test.snap @@ -0,0 +1,126 @@ +{ + "Small dataset - Real expression values": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,05b3b9508930923bd86c281e8febe6b6" + ] + ], + "1": [ + [ + "NORMFINDER", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "NORMFINDER", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "NORMFINDER", + "tqdm", + "4.67.3" + ] + ], + "4": [ + [ + "NORMFINDER", + "numpy", + "2.4.3" + ] + ], + "5": [ + [ + "NORMFINDER", + "numba", + "0.64.0" + ] + ], + "stability_values": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,05b3b9508930923bd86c281e8febe6b6" + ] + ] + } + ], + "timestamp": "2026-03-30T15:45:00.995645591", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Very small dataset - Cq values": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,a7c936faa9135439fd1b86c195f60414" + ] + ], + "1": [ + [ + "NORMFINDER", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "NORMFINDER", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "NORMFINDER", + "tqdm", + "4.67.3" + ] + ], + "4": [ + [ + "NORMFINDER", + "numpy", + "2.4.3" + ] + ], + "5": [ + [ + "NORMFINDER", + "numba", + "0.64.0" + ] + ], + "stability_values": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,a7c936faa9135439fd1b86c195f60414" + ] + ] + } + ], + "timestamp": 
"2026-03-30T15:44:51.060894512", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/quantile_normalisation/main.nf.test b/tests/modules/local/quantile_normalisation/main.nf.test new file mode 100644 index 00000000..469c52bb --- /dev/null +++ b/tests/modules/local/quantile_normalisation/main.nf.test @@ -0,0 +1,54 @@ +nextflow_process { + + name "Test Process QUANTILE_NORMALISATION" + script "modules/local/quantile_normalisation/main.nf" + process "QUANTILE_NORMALISATION" + tag "quant_norm" + + test("Uniform target distribution") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/quantile_normalisation/count.raw.cpm.csv', checkIfExists: true) + ] + input[1] = "uniform" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Normal target distribution") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/quantile_normalisation/count.raw.cpm.csv', checkIfExists: true) + ] + input[1] = "normal" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/quantile_normalisation/main.nf.test.snap b/tests/modules/local/quantile_normalisation/main.nf.test.snap new file mode 100644 index 00000000..ef0a01eb --- /dev/null +++ b/tests/modules/local/quantile_normalisation/main.nf.test.snap @@ -0,0 +1,98 @@ +{ + "Uniform target distribution": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ], + "1": [ + [ + "QUANTILE_NORMALISATION", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "QUANTILE_NORMALISATION", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "QUANTILE_NORMALISATION", + "scikit-learn", + "1.8.0" + ] + ], + "counts": [ + 
[ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ] + } + ], + "timestamp": "2026-04-02T15:07:43.185701068", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "Normal target distribution": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,10c118fd62dad210b585f30620679732" + ] + ], + "1": [ + [ + "QUANTILE_NORMALISATION", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "QUANTILE_NORMALISATION", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "QUANTILE_NORMALISATION", + "scikit-learn", + "1.8.0" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,10c118fd62dad210b585f30620679732" + ] + ] + } + ], + "timestamp": "2026-04-02T15:07:51.53574005", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/variation_coefficient/main.nf.test b/tests/modules/local/variation_coefficient/main.nf.test deleted file mode 100644 index eb779c2e..00000000 --- a/tests/modules/local/variation_coefficient/main.nf.test +++ /dev/null @@ -1,28 +0,0 @@ -nextflow_process { - - name "Test Process VARIATION_COEFFICIENT" - script "modules/local/variation_coefficient/main.nf" - process "VARIATION_COEFFICIENT" - tag "var_coeff" - tag "module" - - test("Should run without failures") { - - when { - - process { - """ - ch_csv_files = Channel.fromPath( '$baseDir/tests/input/variation_coefficient/*.csv', checkIfExists: true) - input[0] = ch_csv_files.collect() - """ - } - } - - then { - assert process.success - assert snapshot(process.out).match() - } - - } - -} diff --git a/tests/modules/local/variation_coefficient/main.nf.test.snap b/tests/modules/local/variation_coefficient/main.nf.test.snap deleted file mode 100644 index f2f76bba..00000000 --- a/tests/modules/local/variation_coefficient/main.nf.test.snap +++ /dev/null @@ -1,26 +0,0 @@ -{ - "Should run 
without failures": { - "content": [ - { - "0": [ - "variation_coefficients.csv:md5,3b7f865c61386784acf289c526b14e55" - ], - "1": [ - [ - "VARIATION_COEFFICIENT", - "R", - "4.3.3 (2024-02-29)" - ] - ], - "csv": [ - "variation_coefficients.csv:md5,3b7f865c61386784acf289c526b14e55" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T00:31:42.877767395" - } -} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index b828e7a6..70fa952f 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -1,7 +1,15 @@ /* ======================================================================================== - Nextflow config file for running tests + Nextflow config file for running nf-test tests ======================================================================================== */ +// TODO nf-core: Specify any additional parameters here +// Or any resources requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/stableexpression' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners enable.moduleBinaries = true diff --git a/tests/subworkflows/local/download_public_datasets/main.nf.test b/tests/subworkflows/local/download_public_datasets/main.nf.test new file mode 100644 index 00000000..54391b25 --- /dev/null +++ b/tests/subworkflows/local/download_public_datasets/main.nf.test @@ -0,0 +1,52 @@ +nextflow_workflow { + + name "Test Workflow DOWNLOAD_PUBLIC_DATASETS" + script "subworkflows/local/download_public_datasets/main.nf" + workflow "DOWNLOAD_PUBLIC_DATASETS" + tag "download_public_datasets" + + test("Beta vulgaris - Eatlas + GEO - all accessions") { + + when { + params { + species = 'beta vulgaris' + } + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = 
channel.fromList(['E-MTAB-8187', 'GSE107627', 'GSE114968', 'GSE135555', 'GSE205413', 'GSE269454', 'GSE281272', 'GSE55951', 'GSE79526', 'GSE92859']) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + test("Beta vulgaris - Eatlas only") { + + when { + params { + species = 'beta vulgaris' + } + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = channel.fromList(['E-MTAB-8187']) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + + +} diff --git a/tests/subworkflows/local/download_public_datasets/main.nf.test.snap b/tests/subworkflows/local/download_public_datasets/main.nf.test.snap new file mode 100644 index 00000000..f02a885b --- /dev/null +++ b/tests/subworkflows/local/download_public_datasets/main.nf.test.snap @@ -0,0 +1,86 @@ +{ + "Beta vulgaris - Eatlas only": { + "content": [ + { + "0": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ], + "datasets": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ] + } + ], + "timestamp": "2025-12-16T15:18:21.726044151", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + } + }, + "Beta vulgaris - Eatlas + GEO - all accessions": { + "content": [ + { + "0": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ], + [ + { + "dataset": "GSE55951_GPL18429", + 
"design": "GSE55951_GPL18429.microarray.normalised.design.csv:md5,f4872dff0edbe441d1600ffe2b67a25d", + "normalised": true, + "platform": "microarray" + }, + "GSE55951_GPL18429.microarray.normalised.counts.csv:md5,18fd2d728ad2ec5cb78f994f73375144" + ] + ], + "datasets": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ], + [ + { + "dataset": "GSE55951_GPL18429", + "design": "GSE55951_GPL18429.microarray.normalised.design.csv:md5,f4872dff0edbe441d1600ffe2b67a25d", + "normalised": true, + "platform": "microarray" + }, + "GSE55951_GPL18429.microarray.normalised.counts.csv:md5,18fd2d728ad2ec5cb78f994f73375144" + ] + ] + } + ], + "timestamp": "2025-12-16T15:18:08.622422246", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + } + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/expression_normalisation/main.nf.test b/tests/subworkflows/local/expression_normalisation/main.nf.test new file mode 100644 index 00000000..34632aff --- /dev/null +++ b/tests/subworkflows/local/expression_normalisation/main.nf.test @@ -0,0 +1,172 @@ +nextflow_workflow { + + name "Test Workflow EXPRESSION_NORMALISATION" + script "subworkflows/local/expression_normalisation/main.nf" + workflow "EXPRESSION_NORMALISATION" + tag "subworkflow_expression_normalisation" + tag "subworkflow" + + test("TPM Normalisation") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: false, + design: file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ), + dataset: "rnaseq_raw", + platform: "rnaseq" + ], + file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + ], + [ + [ + normalised: true, + design: file( 
'$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "tpm" + input[3] = "uniform" + input[4] = null + input[5] = null + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("TPM Normalisation with gene length") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: false, + design: file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ), + dataset: "rnaseq_raw", + platform: "rnaseq" + ], + file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + ], + [ + [ + normalised: true, + design: file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "tpm" + input[3] = "uniform" + input[4] = null + input[5] = file( '$projectDir/tests/test_data/input_datasets/gene_lengths.csv', checkIfExists: true ) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("CPM Normalisation") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: false, + design: file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ), + dataset: "rnaseq_raw", + platform: "rnaseq" + ], + file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + ], + [ + [ + normalised: true, + design: file( 
'$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "cpm" + input[3] = "uniform" + input[4] = null + input[5] = null + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("No rnaseq normalisation") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: true, + design: file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "tpm " + input[3] = "uniform" + input[4] = null + input[5] = null + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + +} diff --git a/tests/subworkflows/local/expression_normalisation/main.nf.test.snap b/tests/subworkflows/local/expression_normalisation/main.nf.test.snap new file mode 100644 index 00000000..0b893abd --- /dev/null +++ b/tests/subworkflows/local/expression_normalisation/main.nf.test.snap @@ -0,0 +1,188 @@ +{ + "CPM Normalisation": { + "content": [ + { + "0": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.cpm.quant_norm.parquet:md5,9f7988ca916b47ed614c824e001d2512" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + 
"microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.cpm.quant_norm.parquet:md5,9f7988ca916b47ed614c824e001d2512" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "timestamp": "2026-03-19T12:27:13.766132141", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "No rnaseq normalisation": { + "content": [ + { + "0": [ + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "timestamp": "2026-03-19T12:27:25.897836784", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "TPM Normalisation with gene length": { + "content": [ + { + "0": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,590b3bd6ec2b09533ef75ce9950d3a92" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + 
"microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,590b3bd6ec2b09533ef75ce9950d3a92" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "timestamp": "2026-03-19T12:27:00.268510601", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "TPM Normalisation": { + "content": [ + { + "0": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,d0e926a720de0803775b0dbd118b03ac" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,d0e926a720de0803775b0dbd118b03ac" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "timestamp": "2026-03-19T12:26:44.852023368", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff 
--git a/tests/subworkflows/local/genorm/main.nf.test b/tests/subworkflows/local/genorm/main.nf.test new file mode 100644 index 00000000..c45f2553 --- /dev/null +++ b/tests/subworkflows/local/genorm/main.nf.test @@ -0,0 +1,53 @@ +nextflow_workflow { + + name "Test Workflow genorm" + script "subworkflows/local/genorm/main.nf" + workflow "GENORM" + tag "subworkflow_genorm" + tag "subworkflow" + + test("10 genes") { + + tag "subworkflow_genorm_10_genes" + + when { + workflow { + """ + input[0] = channel.of([ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.head.parquet', checkIfExists: true) + ]) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + test("1000 genes") { + + tag "subworkflow_genorm_1000_genes" + + when { + workflow { + """ + input[0] = channel.of( [ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true) + ]) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + +} diff --git a/tests/subworkflows/local/genorm/main.nf.test.snap b/tests/subworkflows/local/genorm/main.nf.test.snap new file mode 100644 index 00000000..813be1cf --- /dev/null +++ b/tests/subworkflows/local/genorm/main.nf.test.snap @@ -0,0 +1,56 @@ +{ + "1000 genes": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ], + "m_measures": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ] + } + ], + "timestamp": "2026-04-01T09:56:47.48692894", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "10 genes": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,8bfea16844f247e2b871a8f559a3dd73" + ] + ], + "m_measures": [ + [ + { + "section": "section_1" + }, + 
"m_measures.csv:md5,8bfea16844f247e2b871a8f559a3dd73" + ] + ] + } + ], + "timestamp": "2026-04-01T09:55:53.207791305", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/genorm/run_genorm.py b/tests/subworkflows/local/genorm/run_genorm.py new file mode 100644 index 00000000..9704d7dc --- /dev/null +++ b/tests/subworkflows/local/genorm/run_genorm.py @@ -0,0 +1,44 @@ +import sys + +import numpy as np +import pandas as pd + +file = sys.argv[1] +# Expression data for three control genes. +counts = pd.read_parquet(file) +counts.set_index("gene_id", inplace=True) +counts = counts.T.replace(0, 1e-8) + + +def _m_numpy(gene_expression: np.ndarray) -> np.ndarray: + """Internal control gene-stability measure `M`. + + Computes Eq. (4) in Ref. [1]. + + [1]: Vandesompele, Jo, et al. "Accurate normalization of real-time quantitative + RT-PCR data by geometric averaging of multiple internal control genes." Genome + biology 3.7 (2002): 1-12. + """ + + if not (gene_expression > 0).all(): + raise ValueError( + "Expression domain error: not all expression data are strictly positive!" + ) + + a = gene_expression + # Eq. (2): A_{jk}^{(i)} = log_2 (a_{ij} / a_{ik}) + A = np.log2(np.einsum("ij,ik->ijk", a, 1 / a)) + # Eq. (3) + V = np.std(A, axis=0) + # Eq. (4) N.B., Since V_{j=k} is zero, we can simply ignore it since it does not + # contribute to calculation. 
+ n = V.shape[1] + return np.sum(V, axis=1) / (n - 1) + + +def m_measure(gene_expression): + m_values = _m_numpy(gene_expression.to_numpy()) + return pd.Series(m_values, index=gene_expression.columns) + + +print(m_measure(counts).sort_values()) diff --git a/tests/subworkflows/local/get_public_accessions/main.nf.test b/tests/subworkflows/local/get_public_accessions/main.nf.test new file mode 100644 index 00000000..dc449417 --- /dev/null +++ b/tests/subworkflows/local/get_public_accessions/main.nf.test @@ -0,0 +1,241 @@ +nextflow_workflow { + + name "Test Workflow GET_PUBLIC_ACCESSIONS" + script "subworkflows/local/get_public_accessions/main.nf" + workflow "GET_PUBLIC_ACCESSIONS" + tag "get_public_accessions" + + test("Fetch eatlas accessions without keywords") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + /* + //TODO: see why it gives issues in CI + test("Fetch public accessions with keywords + GEO") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = true + platform = null + keywords = "leaf" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + */ + + test("No GEO + accessions provided") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "E-MTAB-552,E-GEOD-61690 ,E-PROT-138" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("Accessions file + Excluded accessions file") { + + when { + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "E-MTAB-552,E-GEOD-61690 ,E-PROT-138" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = file( '$projectDir/tests/test_data/public_accessions/exclude_one_two_accessions.txt', checkIfExists: true ) + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("With samplling size") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = 2 + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + +} diff --git a/tests/subworkflows/local/get_public_accessions/main.nf.test.snap b/tests/subworkflows/local/get_public_accessions/main.nf.test.snap new file mode 100644 index 00000000..ed1f7832 --- /dev/null +++ b/tests/subworkflows/local/get_public_accessions/main.nf.test.snap @@ -0,0 +1,109 @@ +{ + "Accessions file + Excluded accessions file": { + "content": [ + { + "0": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ], + "accessions": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ] + } + ], + "timestamp": "2026-04-01T14:49:26.109506348", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "No GEO + accessions provided": { + "content": [ + { + "0": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ], + "accessions": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ] + } + ], + "timestamp": "2026-04-01T14:49:15.733265214", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Fetch eatlas accessions without keywords": { + "content": [ + { + "0": [ + "E-MTAB-8187" + ], + "accessions": [ + "E-MTAB-8187" + ] + } + ], + "timestamp": "2026-04-01T14:48:43.670211053", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "With samplling size": { + "content": [ + { + "0": [ + + ], + "accessions": [ + + ] + } + ], + "timestamp": "2026-04-01T14:53:02.05138484", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Fetch public accessions with keywords": { + "content": [ + { + "0": [ + "E-MTAB-8187", + "GSE107627", + "GSE114968", + "GSE269454", + "GSE281272", + "GSE79526" + ], + "accessions": [ + 
"E-MTAB-8187", + "GSE107627", + "GSE114968", + "GSE269454", + "GSE281272", + "GSE79526" + ] + } + ], + "timestamp": "2026-04-01T14:49:04.709620479", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/test_data/aggregate_results/mapping.csv b/tests/test_data/aggregate_results/mapping.csv new file mode 100644 index 00000000..b3c00132 --- /dev/null +++ b/tests/test_data/aggregate_results/mapping.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,ENSRNA049454747 +ENSRNA049434246,ENSRNA049454887 +ENSRNA049434252,SNSRNA049434252 diff --git a/tests/test_data/aggregate_results/metadata.csv b/tests/test_data/aggregate_results/metadata.csv new file mode 100644 index 00000000..d4985a9f --- /dev/null +++ b/tests/test_data/aggregate_results/metadata.csv @@ -0,0 +1,4 @@ +gene_id,name,description +ENSRNA049454747,geneA,descriptionA +ENSRNA049454887,geneB,descriptionB +ENSRNA049454947,geneC,descriptionC diff --git a/tests/test_data/aggregate_results/microarray_stats_all_genes.csv b/tests/test_data/aggregate_results/microarray_stats_all_genes.csv new file mode 100644 index 00000000..0fe7d08f --- /dev/null +++ b/tests/test_data/aggregate_results/microarray_stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,microarray_mean,microarray_standard_deviation,microarray_median,microarray_median_absolute_deviation,microarray_coefficient_of_variation,microarray_robust_coefficient_of_variation_median,microarray_ratio_nulls_in_all_samples,microarray_ratio_nulls_in_valid_samples,microarray_ratio_zeros,microarray_expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 
+ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv b/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv new file mode 100644 index 00000000..de9af372 --- /dev/null +++ b/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,rnaseq_mean,rnaseq_standard_deviation,rnaseq_median,rnaseq_median_absolute_deviation,rnaseq_coefficient_of_variation,rnaseq_robust_coefficient_of_variation_median,rnaseq_ratio_nulls_in_all_samples,rnaseq_ratio_nulls_in_valid_samples,rnaseq_ratio_zeros,rnaseq_expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 
+ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv b/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv new file mode 100644 index 00000000..c210a841 --- /dev/null +++ b/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv @@ -0,0 +1,6 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval,section,normfinder_stability_value,genorm_m_measure,is_candidate,normfinder_stability_value_normalised,genorm_m_measure_normalised,coefficient_of_variation_normalised,robust_coefficient_of_variation_median_normalised,stability_score,rank +ENSRNA049454747,0.570564,0.010209,0.568117,0.008682,0.017892,0.022657,0.000000,0.000000,0.000000,56.000000,9,0.004712,0.067012,1,0.000346,0.000000,0.000000,0.031965,0.032311,1 +ENSRNA049454887,0.552805,0.014706,0.552715,0.009310,0.026603,0.024974,0.000000,0.000000,0.000000,55.000000,9,0.006991,0.071275,1,0.197714,0.097993,0.224146,0.072973,0.592827,2 +ENSRNA049454931,0.556514,0.016277,0.555356,0.012927,0.029249,0.034509,0.000000,0.000000,0.000000,55.000000,9,0.005713,0.070772,1,0.087036,0.086431,0.292232,0.241735,0.707433,3 +ENSRNA049454947,0.565699,0.017542,0.563547,0.009311,0.031010,0.024495,0.000000,0.000000,0.000000,56.000000,9,0.006086,0.076305,1,0.119338,0.213617,0.337545,0.064496,0.734996,4 +ENSRNA049454955,0.577896,0.017702,0.576416,0.012490,0.030632,0.032127,0.000000,0.000000,0.000000,57.000000,9,0.006420,0.069699,1,0.148264,0.061766,0.327818,0.199575,0.737423,5 diff --git a/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv b/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv new file mode 100644 index 00000000..d0d6d6e8 --- /dev/null +++ 
b/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv @@ -0,0 +1,5 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval,section,normfinder_stability_value,genorm_m_measure,is_candidate,normfinder_stability_value_normalised,genorm_m_measure_normalised,coefficient_of_variation_normalised,robust_coefficient_of_variation_median_normalised,stability_score,rank +ENSRNA049454963,0.997524,0.000782,0.997419,0.000419,0.000784,0.000622,0.000000,0.000000,0.000000,99.000000,1,0.000125,0.002924,1,0.006574,0.001386,0.177943,0.089186,0.275089,6 +ENSRNA049454974,0.997944,0.000658,0.998069,0.000409,0.000659,0.000607,0.000000,0.000000,0.000000,99.000000,1,0.000185,0.003060,1,0.050402,0.039091,0.143564,0.086399,0.319456,7 +ENSRNA049455639,0.997911,0.000919,0.997909,0.000533,0.000921,0.000791,0.000000,0.000000,0.000000,99.000000,1,0.000116,0.002919,1,0.000000,0.000000,0.215622,0.120587,0.336209,8 +ENSRNA049455690,0.996857,0.000889,0.996528,0.000433,0.000892,0.000645,0.000000,0.000000,0.000000,99.000000,1,0.000155,0.002944,1,0.028488,0.006931,0.207646,0.093460,0.336524,9 diff --git a/tests/test_data/base_statistics/output/stats_all_genes.csv b/tests/test_data/base_statistics/output/stats_all_genes.csv new file mode 100644 index 00000000..9c2d792e --- /dev/null +++ b/tests/test_data/base_statistics/output/stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval +ENSRNA049454747,0.204895,0.197240,0.332892,0.111337,0.962641,0.495860,0.000000,0.000000,0.466667,28 +ENSRNA049454887,0.525767,0.039664,0.515980,0.014747,0.075440,0.042374,0.000000,0.000000,0.000000,52 
+ENSRNA049454931,0.429906,0.040942,0.439106,0.028691,0.095235,0.096872,0.000000,0.000000,0.000000,43 +ENSRNA049454947,0.337136,0.023450,0.332792,0.010226,0.069556,0.045556,0.000000,0.000000,0.000000,35 +ENSRNA049454955,0.356393,0.077994,0.367554,0.033003,0.218844,0.133124,0.000000,0.000000,0.033333,37 +ENSRNA049454963,0.473395,0.040190,0.468429,0.021211,0.084898,0.067134,0.000000,0.000000,0.000000,47 +ENSRNA049454974,0.652818,0.120681,0.623259,0.073014,0.184861,0.173684,0.000000,0.000000,0.000000,65 +ENSRNA049455639,0.566799,0.038299,0.562763,0.025460,0.067571,0.067073,0.000000,0.000000,0.000000,56 +ENSRNA049455690,0.653952,0.036833,0.647126,0.016865,0.056324,0.038639,0.000000,0.000000,0.000000,65 diff --git a/tests/test_data/compute_gene_statistics/input/design.csv b/tests/test_data/compute_gene_statistics/input/design.csv new file mode 100644 index 00000000..d3e8694c --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/design.csv @@ -0,0 +1,28 @@ +sample,condition,batch +ARR029909,g1,A +ARR029910,g1,A +ARR029911,g1,A +ARR029912,g2,A +ARR029913,g2,A +ARR029914,g2,A +ARR029915,g3,A +ARR029916,g3,A +ARR029917,g3,A +URR029909,g1,B +URR029910,g1,B +URR029911,g1,B +URR029912,g2,B +URR029913,g2,B +URR029914,g2,B +URR029915,g3,B +URR029916,g3,B +URR029917,g3,B +ERR029909,g1,C +ERR029910,g1,C +ERR029911,g1,C +ERR029912,g2,C +ERR029913,g2,C +ERR029914,g2,C +ERR029915,g3,C +ERR029916,g3,C +ERR029917,g3,C diff --git a/tests/test_data/compute_gene_statistics/input/gene_counts.csv b/tests/test_data/compute_gene_statistics/input/gene_counts.csv new file mode 100644 index 00000000..fad53618 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/gene_counts.csv @@ -0,0 +1,28 @@ +sample,count +ARR029909,4 +ARR029910,4 +ARR029911,4 +ARR029912,4 +ARR029913,4 +ARR029914,4 +ARR029915,4 +ARR029916,4 +ARR029917,4 +URR029909,2 +URR029910,2 +URR029911,2 +URR029912,2 +URR029913,2 +URR029914,2 +URR029915,2 +URR029916,2 +URR029917,2 +ERR029909,3 +ERR029910,3 
+ERR029911,3 +ERR029912,3 +ERR029913,3 +ERR029914,3 +ERR029915,3 +ERR029916,3 +ERR029917,3 diff --git a/tests/test_data/compute_gene_statistics/input/ks_stats.csv b/tests/test_data/compute_gene_statistics/input/ks_stats.csv new file mode 100644 index 00000000..119c4ae5 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/ks_stats.csv @@ -0,0 +1,27 @@ +URR029909,0.99 +URR029910,0.58 +URR029911,0.24 +URR029912,0.12 +URR029913,0.05 +URR029914,0.0 +URR029915,0.897 +URR029916,0.999 +URR029917,0.23 +ERR029909,0.45 +ERR029910,0.87 +ERR029911,0.456 +ERR029912,0.457 +ERR029913,0.78 +ERR029914,0.32 +ERR029915,0.56 +ERR029916,0.45 +ERR029917,0.12 +ARR029909,0.21 +ARR029910,0.0000005 +ARR029911,0 +ARR029912,0.789 +ARR029913,0.987 +ARR029914,0.876 +ARR029915,0.123 +ARR029916,0.321 +ARR029917,0.156 diff --git a/tests/test_data/compute_gene_statistics/input/mapping1.csv b/tests/test_data/compute_gene_statistics/input/mapping1.csv new file mode 100644 index 00000000..8c5865b4 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/mapping1.csv @@ -0,0 +1,9 @@ +original_gene_id,gene_id +Q8VWG3,AT1G34790 +Q9FJA2,AT5G35550 +Q8RYD9,AT5G23260 +ABCD12,AT5G23261 +840386,AT1G34790 +833520,AT5G35550 +832390,AT5G23260 +123456,AT5G35550 diff --git a/tests/test_data/compute_gene_statistics/input/mapping2.csv b/tests/test_data/compute_gene_statistics/input/mapping2.csv new file mode 100644 index 00000000..080dbefd --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/mapping2.csv @@ -0,0 +1,9 @@ +original_gene_id,gene_id +Q8VWG3,AT1G34790 +Q9FJA2,AT5G35550 +Q8RYD9,AT5G23260 +ABCD12,AT5G23261 +840386,AT1G34790 +833520,AT5G35550 +832390,AT5G23260 +457862,AT5G23260 diff --git a/tests/test_data/compute_gene_statistics/input/mapping3.csv b/tests/test_data/compute_gene_statistics/input/mapping3.csv new file mode 100644 index 00000000..c8fbe3f9 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/mapping3.csv @@ -0,0 +1,5 @@ +original_gene_id,gene_id 
+Q8VWG3,AT1G34790 +Q9FJA2,AT5G35550 +Q8RYD9,AT5G23260 +152348,AT1G23260 diff --git a/tests/test_data/compute_gene_statistics/input/metadata1.csv b/tests/test_data/compute_gene_statistics/input/metadata1.csv new file mode 100644 index 00000000..399628bf --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/metadata1.csv @@ -0,0 +1,5 @@ +gene_id,name,description +AT1G34790,TT1,C2H2 and C2HC zinc fingers superfamily protein +AT5G35550,TT2,Duplicated homeodomain-like superfamily protein +AT5G23260,TT16,K-box region and MADS-box transcription factor family protein +AT5G23261,TT23,blabla diff --git a/tests/test_data/compute_gene_statistics/input/metadata2.csv b/tests/test_data/compute_gene_statistics/input/metadata2.csv new file mode 100644 index 00000000..69fadca4 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/metadata2.csv @@ -0,0 +1,4 @@ +gene_id,name,description +AT1G34790,TT1,C2H2 and C2HC zinc fingers superfamily protein +AT5G35550,TT2,Duplicated homeodomain-like superfamily protein +AT5G23260,TT16,K-box region and MADS-box transcription factor family protein diff --git a/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv b/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv new file mode 100644 index 00000000..e40090d6 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv @@ -0,0 +1,8 @@ +gene_id,microarray_mean,microarray_standard_deviation,microarray_median,microarray_median_absolute_deviation,microarray_variation_coefficient,microarray_total_nb_nulls,microarray_nb_nulls_valid_samples,microarray_stability_score,microarray_expression_level_quantile_interval +AT1G34790,0.6041722385984585,0.2965945020346847,0.8210634736950527,0.07852066041592076,0.49091051042450434,678,643,1.4392880915454482,71 +AT5G35550,0.04211885958141837,0.017403154131542625,0.04081717758449555,0.00889668425147598,0.41319148487155133,678,643,1.3615690659924953,0 
+AT5G23260,0.3265572056851324,0.12636844695328353,0.2977133397782717,0.09861099799987358,0.3869718528738528,678,643,1.3353494339947967,35 +AT5G23261,0.05948100952172446,0.0268768665570047,0.049569984365840696,0.021228253513649518,0.4518562608993441,678,643,1.400233842020288,1 +AT1G34790,0.5791984846868644,0.16532007773816776,0.5865184277282238,0.13319224137108376,0.28542905775650596,70,35,0.337051476635562,68 +AT5G35550,0.4069181057633956,0.2662419700433056,0.26506770843115524,0.13156965253473574,0.6542888268484007,678,643,1.6026664079693447,46 +AT5G23260,0.12079194562039748,0.060559689529495545,0.10818687095210754,0.0368400391021249,0.5013553612242599,678,643,1.449732942345204,7 diff --git a/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv b/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv new file mode 100644 index 00000000..fd6b8853 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv @@ -0,0 +1,9 @@ +sample,ratio +sample_63,0.0 +sample_64,0.0 +sample_65,0.0 +sample_66,0.0 +sample_67,0.0 +sample_68,0.0 +sample_69,0.0 +sample_70,0.0 diff --git a/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv b/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv new file mode 100644 index 00000000..e4c7327d --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv @@ -0,0 +1,8 @@ +gene_id,rnaseq_mean,rnaseq_standard_deviation,rnaseq_median,rnaseq_median_absolute_deviation,rnaseq_variation_coefficient,rnaseq_total_nb_nulls,rnaseq_nb_nulls_valid_samples,rnaseq_stability_score,rnaseq_expression_level_quantile_interval +AT1G34790,0.029004004004004002,0.061217504567865136,0.0,0.0,2.110657016852365,345,336,3.0544772415714663,0 +AT5G35550,0.2921254587921254,0.028005675342417956,0.28128128128128127,0.025025025025025016,0.09586865676896245,356,347,1.070587757892558,41 
+AT5G23260,0.051621388830691145,0.04715133948046024,0.04154154154154154,0.027027027027027035,0.9134070304677029,322,313,1.7926205136137703,3 +AT5G23261,0.06000444889333778,0.0796183056079376,0.030030030030030026,0.030030030030030026,1.3268733748303374,356,347,2.301592475953933,5 +AT1G34790,0.027638749860972082,0.019581626793675158,0.025525525525525526,0.014014014014014014,0.7084845332069752,356,347,1.6832036343305707,0 +AT5G35550,0.07687920478618152,0.05023977809403856,0.06906906906906907,0.03603603603603604,0.6534898251583997,322,313,1.532703308304467,8 +AT5G23260,0.05421550582840906,0.0785887308235655,0.0,0.0,1.4495618849761762,303,294,2.2754045816053896,4 diff --git a/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv b/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv new file mode 100644 index 00000000..b4b6ae10 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv @@ -0,0 +1,10 @@ +gene_id,genorm_m_measure +ENSRNA049454747,0.16034699963469335 +ENSRNA049454887,0.525024672172669794 +ENSRNA049454931,0.264017707597323344 +ENSRNA049454947,0.037074358179388235 +ENSRNA049454955,0.65294154739420848 +ENSRNA049454963,0.213698246698642331 +ENSRNA049454974,0.16807095772646336 +ENSRNA049455639,0.02698654413301954 +ENSRNA049455690,0.57785261216485885 diff --git a/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv b/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv new file mode 100644 index 00000000..238572e2 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv @@ -0,0 +1,10 @@ +gene_id,normfinder_stability_value +ENSRNA049454747,0.036034699963469335 +ENSRNA049454887,0.05024672172669794 +ENSRNA049454931,0.014017707597323344 +ENSRNA049454947,0.037074358179388235 +ENSRNA049454955,0.03294154739420848 +ENSRNA049454963,0.03698246698642331 +ENSRNA049454974,0.06807095772646336 
+ENSRNA049455639,0.02698654413301954 +ENSRNA049455690,0.07785261216485885 diff --git a/tests/test_data/compute_stability_scores/input/stats_all_genes.csv b/tests/test_data/compute_stability_scores/input/stats_all_genes.csv new file mode 100644 index 00000000..c5ef7f74 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval,section +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99,19 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11,19 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66,19 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44,19 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55,19 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77,19 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88,19 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22,19 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33,19 diff --git a/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet b/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet new file mode 100644 index 00000000..04b808d5 Binary files /dev/null and b/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet differ diff --git a/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet 
b/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet new file mode 100644 index 00000000..c3f8fa07 Binary files /dev/null and b/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet differ diff --git a/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet b/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet new file mode 100644 index 00000000..d9082e66 Binary files /dev/null and b/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet differ diff --git a/tests/test_data/dataset_statistics/output/test.dataset_stats.csv b/tests/test_data/dataset_statistics/output/test.dataset_stats.csv new file mode 100644 index 00000000..ead888ce --- /dev/null +++ b/tests/test_data/dataset_statistics/output/test.dataset_stats.csv @@ -0,0 +1,9 @@ +sample,count,mean,std,min,25%,50%,75%,max,skewness,kolmogorov_smirnov_pvalue +sample_63,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_64,9.0,0.5,0.34089725358236606,0.0,0.25,0.5625,0.75,1.0,-0.0059425832940604335,0.013238665147108418 +sample_65,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_66,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_67,9.0,0.4861111111111111,0.361444824435364,0.0,0.25,0.5,0.75,1.0,-0.09766083489340871,0.013238665147108418 +sample_68,9.0,0.4861111111111111,0.361444824435364,0.0,0.25,0.5,0.75,1.0,-0.09766083489340871,0.013238665147108418 +sample_69,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_70,9.0,0.5,0.34089725358236606,0.0,0.3125,0.5,0.75,1.0,0.0178277498821813,0.013238665147108418 diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet new file mode 100644 index 00000000..23b14fa8 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet differ diff 
--git a/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet new file mode 100644 index 00000000..2df52c3b Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet new file mode 100644 index 00000000..48e587cf Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet new file mode 100644 index 00000000..2984fee1 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet new file mode 100644 index 00000000..fae48626 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet new file mode 100644 index 00000000..5dcaaf98 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet new file mode 100644 index 00000000..b297f2a0 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet new file mode 100644 index 00000000..3b7cda0f Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet differ 
diff --git a/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet new file mode 100644 index 00000000..168d2c51 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet new file mode 100644 index 00000000..e1fff9b2 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet differ diff --git a/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet b/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet new file mode 100644 index 00000000..d92b69ef Binary files /dev/null and b/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet differ diff --git a/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet b/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet new file mode 100644 index 00000000..b55c4965 Binary files /dev/null and b/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet differ diff --git a/tests/test_data/genorm/make_chunks/input/counts.head.parquet b/tests/test_data/genorm/make_chunks/input/counts.head.parquet new file mode 100644 index 00000000..b63b13ef Binary files /dev/null and b/tests/test_data/genorm/make_chunks/input/counts.head.parquet differ diff --git a/tests/test_data/genorm/make_chunks/input/counts.parquet b/tests/test_data/genorm/make_chunks/input/counts.parquet new file mode 100644 index 00000000..c9764863 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/input/counts.parquet differ diff --git a/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet new file mode 100644 index 00000000..2367ea1b Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet differ diff 
--git a/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet new file mode 100644 index 00000000..98442e57 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet differ diff --git a/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet new file mode 100644 index 00000000..5e207c7a Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet differ diff --git a/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet b/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet new file mode 100644 index 00000000..995652e8 Binary files /dev/null and b/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet differ diff --git a/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet b/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet new file mode 100644 index 00000000..976ff07d Binary files /dev/null and b/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet differ diff --git a/tests/test_data/idmapping/base/counts.ensembl_ids.csv b/tests/test_data/idmapping/base/counts.ensembl_ids.csv new file mode 100644 index 00000000..a093ec4b --- /dev/null +++ b/tests/test_data/idmapping/base/counts.ensembl_ids.csv @@ -0,0 +1,4 @@ +gend_id,ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ERR029918,ERR029920,ERR029921,ERR029922,ERR029923,ERR029924 +ENSRNA049434199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +ENSRNA049434246,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +ENSRNA049434252,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/tests/input/idmapping/counts.ncbi_ids.csv b/tests/test_data/idmapping/base/counts.ncbi_ids.csv similarity index 100% rename from tests/input/idmapping/counts.ncbi_ids.csv rename to tests/test_data/idmapping/base/counts.ncbi_ids.csv 
diff --git a/tests/input/idmapping/counts.uniprot_ids.csv b/tests/test_data/idmapping/base/counts.uniprot_ids.csv similarity index 100% rename from tests/input/idmapping/counts.uniprot_ids.csv rename to tests/test_data/idmapping/base/counts.uniprot_ids.csv diff --git a/tests/test_data/idmapping/custom/mapping.csv b/tests/test_data/idmapping/custom/mapping.csv new file mode 100644 index 00000000..cd43e30f --- /dev/null +++ b/tests/test_data/idmapping/custom/mapping.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,SNSRNA049434199 +ENSRNA049434246,SNSRNA049434246 +ENSRNA049434252,SNSRNA049434252 diff --git a/tests/test_data/idmapping/custom/metadata.csv b/tests/test_data/idmapping/custom/metadata.csv new file mode 100644 index 00000000..0c4095a9 --- /dev/null +++ b/tests/test_data/idmapping/custom/metadata.csv @@ -0,0 +1,4 @@ +gene_id,name,description +SNSRNA049434199,geneA,descriptionA +SNSRNA049434246,geneB,descriptionB +SNSRNA049434252,geneC,descriptionC diff --git a/tests/test_data/idmapping/empty/counts.csv b/tests/test_data/idmapping/empty/counts.csv new file mode 100644 index 00000000..b8d84b76 --- /dev/null +++ b/tests/test_data/idmapping/empty/counts.csv @@ -0,0 +1 @@ +sample_1,sample_2,sample_3 diff --git a/tests/test_data/idmapping/gene_ids/gene_ids.txt b/tests/test_data/idmapping/gene_ids/gene_ids.txt new file mode 100644 index 00000000..94233419 --- /dev/null +++ b/tests/test_data/idmapping/gene_ids/gene_ids.txt @@ -0,0 +1,9 @@ +ENSRNA049434199 +ENSRNA049434246 +ENSRNA049434252 +840386 +833520 +832390 +Q8VWG3 +Q9FJA2 +Q8RYD9 diff --git a/tests/test_data/idmapping/mapped/mapped_gene_ids.csv b/tests/test_data/idmapping/mapped/mapped_gene_ids.csv new file mode 100644 index 00000000..84561688 --- /dev/null +++ b/tests/test_data/idmapping/mapped/mapped_gene_ids.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,ENSRNA049434199 +ENSRNA049434246,ENSRNA049434246 +ENSRNA049434252,ENSRNA049434252 diff --git 
a/tests/test_data/idmapping/mapped/no_valid_gene_id.txt b/tests/test_data/idmapping/mapped/no_valid_gene_id.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_data/idmapping/mapped/valid_gene_ids.txt b/tests/test_data/idmapping/mapped/valid_gene_ids.txt new file mode 100644 index 00000000..4fc4b319 --- /dev/null +++ b/tests/test_data/idmapping/mapped/valid_gene_ids.txt @@ -0,0 +1,2 @@ +ENSRNA049434199 +ENSRNA049434246 diff --git a/tests/test_data/idmapping/not_found/counts.csv b/tests/test_data/idmapping/not_found/counts.csv new file mode 100644 index 00000000..2b8ebd50 --- /dev/null +++ b/tests/test_data/idmapping/not_found/counts.csv @@ -0,0 +1,4 @@ +sample_1,sample_2,sample_3 +8173941,1,2,3 +8168737,1,2,3 +8067017,1,2,3 diff --git a/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv b/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv new file mode 100644 index 00000000..b1e1511d --- /dev/null +++ b/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv @@ -0,0 +1,4 @@ +gene_id ERR029909 ERR029910 ERR029911 ERR029912 ERR029913 ERR029914 ERR029915 ERR029916 ERR029917 ERR029918 ERR029920 ERR029921 ERR029922 ERR029923 ERR029924 +ENSRNA049434199 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ENSRNA049434246 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ENSRNA049434252 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/tests/test_data/idmapping/tsv/mapping.tsv b/tests/test_data/idmapping/tsv/mapping.tsv new file mode 100644 index 00000000..c425f89f --- /dev/null +++ b/tests/test_data/idmapping/tsv/mapping.tsv @@ -0,0 +1,4 @@ +original_gene_id gene_id +ENSRNA049434199 SNSRNA049434199 +ENSRNA049434246 SNSRNA049434246 +ENSRNA049434252 SNSRNA049434252 diff --git a/tests/test_data/idmapping/tsv/metadata.tsv b/tests/test_data/idmapping/tsv/metadata.tsv new file mode 100644 index 00000000..11eae353 --- /dev/null +++ b/tests/test_data/idmapping/tsv/metadata.tsv @@ -0,0 +1,4 @@ +gene_id name description +SNSRNA049434199 geneA descriptionA +SNSRNA049434246 geneB descriptionB 
+SNSRNA049434252 geneC descriptionC diff --git a/tests/test_data/idmapping/tsv/valid_gene_ids.txt b/tests/test_data/idmapping/tsv/valid_gene_ids.txt new file mode 100644 index 00000000..4fc4b319 --- /dev/null +++ b/tests/test_data/idmapping/tsv/valid_gene_ids.txt @@ -0,0 +1,2 @@ +ENSRNA049434199 +ENSRNA049434246 diff --git a/tests/test_data/input_datasets/gene_lengths.csv b/tests/test_data/input_datasets/gene_lengths.csv new file mode 100644 index 00000000..03ffba68 --- /dev/null +++ b/tests/test_data/input_datasets/gene_lengths.csv @@ -0,0 +1,10 @@ +gene_id,length +ENSRNA049453121,100 +ENSRNA049453138,200 +ENSRNA049454388,300 +ENSRNA049454416,400 +ENSRNA049454647,500 +ENSRNA049454661,600 +ENSRNA049454747,700 +ENSRNA049454887,800 +ENSRNA049454931,900 diff --git a/tests/test_data/input_datasets/input.csv b/tests/test_data/input_datasets/input.csv new file mode 100644 index 00000000..73278d53 --- /dev/null +++ b/tests/test_data/input_datasets/input.csv @@ -0,0 +1,3 @@ +counts,design,platform,normalised +https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/microarray.normalised.csv,https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/microarray.normalised.design.csv,microarray,true +https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq.raw.csv,https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq.raw.design.csv,rnaseq,false diff --git a/tests/test_data/input_datasets/input_big.yaml b/tests/test_data/input_datasets/input_big.yaml new file mode 100644 index 00000000..f54577bb --- /dev/null +++ b/tests/test_data/input_datasets/input_big.yaml @@ -0,0 +1,4 @@ +- counts: https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/modules_testdata/SRP254919.salmon.merged.gene_counts.top1000cov.assay.tsv + design: 
https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq_big.design.csv + platform: rnaseq + normalised: false diff --git a/tests/test_data/input_datasets/mapping.csv b/tests/test_data/input_datasets/mapping.csv new file mode 100644 index 00000000..04489426 --- /dev/null +++ b/tests/test_data/input_datasets/mapping.csv @@ -0,0 +1,10 @@ +original_gene_id,gene_id +ENSRNA049453121,SNSRNA049434199 +ENSRNA049453138,SNSRNA049434246 +ENSRNA049454388,SNSRNA049434252 +ENSRNA049454416,SNSRNA049434253 +ENSRNA049454647,SNSRNA049434254 +ENSRNA049454661,SNSRNA049434255 +ENSRNA049454747,SNSRNA049434256 +ENSRNA049454887,SNSRNA049434257 +ENSRNA049454931,SNSRNA049434258 diff --git a/tests/test_data/input_datasets/metadata.csv b/tests/test_data/input_datasets/metadata.csv new file mode 100644 index 00000000..fcccf222 --- /dev/null +++ b/tests/test_data/input_datasets/metadata.csv @@ -0,0 +1,10 @@ +gene_id,name,description +ENSRNA049453121,geneA,descriptionA +ENSRNA049453138,geneB,descriptionB +ENSRNA049454388,geneC,descriptionC +ENSRNA049454416,geneD,descriptionD +ENSRNA049454647,geneE,descriptionE +ENSRNA049454661,geneF,descriptionF +ENSRNA049454747,geneG,descriptionG +ENSRNA049454887,geneH,descriptionH +ENSRNA049454931,geneI,descriptionI diff --git a/tests/test_data/input_datasets/microarray.normalised.csv b/tests/test_data/input_datasets/microarray.normalised.csv new file mode 100644 index 00000000..81f3f904 --- /dev/null +++ b/tests/test_data/input_datasets/microarray.normalised.csv @@ -0,0 +1,10 @@ +gene_id,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586,GSM1528582,GSM1528578,GSM1528581,GSM1528577 +ENSRNA049453121,20925.1255070264,136184.261516502,144325.370645564,89427.0987612997,164143.182734208,34178.6378088171,28842.7323281157,76973.395782103,41906.9367255656,44756.5602263121,252562.049703724,6953.65643340122 
+ENSRNA049453138,196173.051628372,16607.8367703051,344972.83715281,22602.4535330758,13678.598561184,104546.421532852,15451.4637472048,71664.8857281649,160643.257448002,91459.0578537683,88396.7173963033,281623.08555275 +ENSRNA049454388,91547.4240932405,11625.4857392136,84483.143792525,80582.6604222701,218857.576978944,58304.7350856292,42234.0009090266,88475.1675656357,87306.1181782617,17513.436610296,90922.3378933406,76490.2207674135 +ENSRNA049454416,20925.1255070264,106290.155329953,193607.204524536,47170.3378081581,392119.825420608,190998.270108096,90648.5873169351,81397.1541603848,83813.8734511313,165404.67909724,111127.301869638,194702.380135234 +ENSRNA049454647,99394.3461583754,91343.1022366783,3520.13099135521,71738.2220832404,118547.854196928,20105.0810640101,81377.7090686122,15040.7784861581,66352.6498154789,110918.431865208,55563.6509348192,111258.50293442 +ENSRNA049454661,175247.926121346,66431.3470812206,24640.9169394865,52083.9146631746,360203.095444512,36189.1459152181,70046.6356539953,85820.9125386666,13968.9789085219,50594.3724297441,25256.2049703724,52152.4232505092 +ENSRNA049454747,117703.830977024,154452.881963838,281610.479308417,29481.4611300988,191500.379856576,152798.616086476,53565.0743236435,14156.0268105017,293348.557078959,155674.99209152,63140.5124259309,243377.975169043 +ENSRNA049454887,2615.6406883783,164417.584026021,28161.0479308417,82548.0911642767,50154.861391008,136714.551235268,97859.270398964,64586.872322914,328271.004350264,159566.866893808,151537.229822234,86920.7054175153 +ENSRNA049454931,177863.566809724,81378.4001744952,235848.776420799,88444.3833902964,18238.131414912,120630.48638406,82407.8066517592,50430.8455124123,118736.320722436,68107.8090400402,232357.085727426,163410.926184929 diff --git a/tests/test_data/input_datasets/microarray.normalised.design.csv b/tests/test_data/input_datasets/microarray.normalised.design.csv new file mode 100644 index 00000000..d31e5cef --- /dev/null +++ 
b/tests/test_data/input_datasets/microarray.normalised.design.csv @@ -0,0 +1,13 @@ +sample,condition +GSM1528575,g1 +GSM1528576,g1 +GSM1528579,g1 +GSM1528583,g2 +GSM1528584,g2 +GSM1528585,g2 +GSM1528580,g3 +GSM1528586,g3 +GSM1528582,g3 +GSM1528578,g4 +GSM1528581,g4 +GSM1528577,g4 diff --git a/tests/test_data/input_datasets/rnaseq.raw.csv b/tests/test_data/input_datasets/rnaseq.raw.csv new file mode 100644 index 00000000..5688c066 --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq.raw.csv @@ -0,0 +1,10 @@ +gene_id,ESM1528575,ESM1528576,ESM1528579,ESM1528583,ESM1528584,ESM1528585,ESM1528580,ESM1528586,ESM1528582,ESM1528578,ESM1528581,ESM1528577 +ENSRNA049453121,1,82,8,82,4,68,88,73,46,57,25,22 +ENSRNA049453138,68,93,41,84,36,18,28,92,84,85,92,32 +ENSRNA049454388,38,10,0,23,11,17,95,57,25,82,10,70 +ENSRNA049454416,75,55,7,30,79,60,15,97,12,35,60,56 +ENSRNA049454647,35,64,55,91,48,95,68,100,24,26,100,47 +ENSRNA049454661,8,99,80,48,86,29,80,17,19,9,44,2 +ENSRNA049454747,67,7,98,53,3,10,52,87,4,80,22,15 +ENSRNA049454887,8,40,24,90,42,52,79,81,94,23,35,81 +ENSRNA049454931,45,49,67,73,26,76,41,16,34,47,36,25 diff --git a/tests/test_data/input_datasets/rnaseq.raw.design.csv b/tests/test_data/input_datasets/rnaseq.raw.design.csv new file mode 100644 index 00000000..469751d2 --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq.raw.design.csv @@ -0,0 +1,13 @@ +sample,condition +ESM1528575,g1 +ESM1528576,g1 +ESM1528579,g1 +ESM1528583,g2 +ESM1528584,g2 +ESM1528585,g2 +ESM1528580,g3 +ESM1528586,g3 +ESM1528582,g3 +ESM1528578,g4 +ESM1528581,g4 +ESM1528577,g4 diff --git a/tests/test_data/input_datasets/rnaseq_big.design.csv b/tests/test_data/input_datasets/rnaseq_big.design.csv new file mode 100644 index 00000000..e8de12df --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq_big.design.csv @@ -0,0 +1,7 @@ +sample,condition +SRX8042381,control +SRX8042382,control +SRX8042383,control +SRX8042384,treatment +SRX8042385,treatment +SRX8042386,treatment diff --git 
a/tests/test_data/merge_data/input/counts1.parquet b/tests/test_data/merge_data/input/counts1.parquet new file mode 100644 index 00000000..b4d98e52 Binary files /dev/null and b/tests/test_data/merge_data/input/counts1.parquet differ diff --git a/tests/test_data/merge_data/input/counts2.parquet b/tests/test_data/merge_data/input/counts2.parquet new file mode 100644 index 00000000..e9eca845 Binary files /dev/null and b/tests/test_data/merge_data/input/counts2.parquet differ diff --git a/tests/test_data/merge_data/input/counts3.parquet b/tests/test_data/merge_data/input/counts3.parquet new file mode 100644 index 00000000..af4fa697 Binary files /dev/null and b/tests/test_data/merge_data/input/counts3.parquet differ diff --git a/tests/test_data/merge_data/input/dataset_stat1.csv b/tests/test_data/merge_data/input/dataset_stat1.csv new file mode 100644 index 00000000..feca6c83 --- /dev/null +++ b/tests/test_data/merge_data/input/dataset_stat1.csv @@ -0,0 +1,10 @@ +sample,count,skewness,kolmogorov_smirnov_to_uniform_dist_pvalue +ARR029909,1,1,1 +ARR029910,2,3,1 +ARR029911,3,5,1 +ARR029912,4,4,9 +ARR029913,5,1,5 +ARR029914,6,6,6 +ARR029915,7,1,9 +ARR029916,8,8,1 +ARR029917,9,3,9 diff --git a/tests/test_data/merge_data/input/dataset_stat2.csv b/tests/test_data/merge_data/input/dataset_stat2.csv new file mode 100644 index 00000000..a7c0ea8b --- /dev/null +++ b/tests/test_data/merge_data/input/dataset_stat2.csv @@ -0,0 +1,10 @@ +sample,count,skewness,kolmogorov_smirnov_to_uniform_dist_pvalue +URR029909,1,1,1 +URR029910,2,2,2 +URR029911,3,2,3 +URR029912,4,4,4 +URR029913,5,5,5 +URR029914,6,6,3 +URR029915,7,7,7 +URR029916,8,8,8 +URR029917,9,9,9 diff --git a/tests/test_data/merge_data/input/dataset_stat3.csv b/tests/test_data/merge_data/input/dataset_stat3.csv new file mode 100644 index 00000000..28be6731 --- /dev/null +++ b/tests/test_data/merge_data/input/dataset_stat3.csv @@ -0,0 +1,10 @@ +sample,count,skewness,kolmogorov_smirnov_to_uniform_dist_pvalue +ERR029909,1,1,1 
+ERR029910,2,2,2 +ERR029911,3,3,3 +ERR029912,4,9,4 +ERR029913,5,5,5 +ERR029914,6,6,6 +ERR029915,7,7,7 +ERR029916,8,8,1 +ERR029917,9,9,9 diff --git a/tests/test_data/merge_data/input/design1.csv b/tests/test_data/merge_data/input/design1.csv new file mode 100644 index 00000000..f9b61c49 --- /dev/null +++ b/tests/test_data/merge_data/input/design1.csv @@ -0,0 +1,10 @@ +sample,condition +ARR029909,g1 +ARR029910,g1 +ARR029911,g1 +ARR029912,g2 +ARR029913,g2 +ARR029914,g2 +ARR029915,g3 +ARR029916,g3 +ARR029917,g3 diff --git a/tests/test_data/merge_data/input/design2.csv b/tests/test_data/merge_data/input/design2.csv new file mode 100644 index 00000000..dcb29ec8 --- /dev/null +++ b/tests/test_data/merge_data/input/design2.csv @@ -0,0 +1,10 @@ +sample,condition +URR029909,g1 +URR029910,g1 +URR029911,g1 +URR029912,g2 +URR029913,g2 +URR029914,g2 +URR029915,g3 +URR029916,g3 +URR029917,g3 diff --git a/tests/test_data/merge_data/input/design3.csv b/tests/test_data/merge_data/input/design3.csv new file mode 100644 index 00000000..75caca86 --- /dev/null +++ b/tests/test_data/merge_data/input/design3.csv @@ -0,0 +1,10 @@ +batch,sample,condition +batch3,ERR029909,g1 +batch3,ERR029910,g1 +batch3,ERR029911,g1 +batch3,ERR029912,g2 +batch3,ERR029913,g2 +batch3,ERR029914,g2 +batch3,ERR029915,g3 +batch3,ERR029916,g3 +batch3,ERR029917,g3 diff --git a/tests/test_data/merge_data/output/all_counts.csv b/tests/test_data/merge_data/output/all_counts.csv new file mode 100644 index 00000000..527a2205 --- /dev/null +++ b/tests/test_data/merge_data/output/all_counts.csv @@ -0,0 +1,15 @@ +gene_id,URR029909,URR029910,URR029911,URR029912,URR029913,URR029914,URR029915,URR029916,URR029917,ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ARR029909,ARR029910,ARR029911,ARR029912,ARR029913,ARR029914,ARR029915,ARR029916,ARR029917 
+AT1G34790,0.60113057,0.64080682,0.6,0.6197164000000003,0.60115891,0.63052843,0.61002869,0.65849011,0.66239896,0.60113057,0.64080682,0.6348181099999999,0.6519716400000001,0.60115891,0.63052843,0.61002869,0.65849011,0.66239896,0.60113057,0.64080682,0.6348181099999999,0.6519716400000001,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 +AT5G35550,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.0,0.8336608,0.00340416,0.23179154000000002,0.0,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002 +AT5G23260,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134 +AT1G34791,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 +AT5G35551,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.0,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002 
+AT5G23261,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134 +AT1G34792,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.0,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 +AT5G35552,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002 +AT5G23262,0.0,0.47981484,0.85599454,0.69023553,0.0,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.0,0.0,0.0,0.73996866,0.0,0.0,0.0,0.0,0.0,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134 +AT1G34793,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.0,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 +AT5G35553,,0.21713193,,,,,,,,,,,,,0,,,,,0.9348181099999999,,,0.35197164000000003,,,,0.0 +AT5G35554,,0.01713193,,,,,,,,,,,,,0.01,,,,,0.15,,,0.151,,,,0.0114 +AT5G35555,,0.01713193,,,,,0.0,,,,,,,,0.01,,,,,0.0,,,0.151,,,,0.011 +AT5G23263,,,,,,,,,,,,,,,,,,,,,,,,,,, 
diff --git a/tests/test_data/merge_data/output/all_counts.parquet b/tests/test_data/merge_data/output/all_counts.parquet new file mode 100644 index 00000000..35fb1441 Binary files /dev/null and b/tests/test_data/merge_data/output/all_counts.parquet differ diff --git a/tests/test_data/misc/accessions_to_include.txt b/tests/test_data/misc/accessions_to_include.txt new file mode 100644 index 00000000..7020d409 --- /dev/null +++ b/tests/test_data/misc/accessions_to_include.txt @@ -0,0 +1,2 @@ +E-MTAB-4252 +E-MTAB-4253 diff --git a/tests/test_data/misc/excluded_accessions.txt b/tests/test_data/misc/excluded_accessions.txt new file mode 100644 index 00000000..6c403a93 --- /dev/null +++ b/tests/test_data/misc/excluded_accessions.txt @@ -0,0 +1,2 @@ +E-MTAB-4251 +E-MTAB-4301 diff --git a/tests/input/normalize/all_counts.csv b/tests/test_data/normalisation/base/counts.csv similarity index 95% rename from tests/input/normalize/all_counts.csv rename to tests/test_data/normalisation/base/counts.csv index 9dbfef27..ba76be4e 100644 --- a/tests/input/normalize/all_counts.csv +++ b/tests/test_data/normalisation/base/counts.csv @@ -10,3 +10,4 @@ ENSRNA549434206,42,49,4,88,82,34,27,83,98 ENSRNA549434207,82,93,85,14,38,8,98,97,30 ENSRNA549434208,72,36,4,60,25,7,14,76,47 ENSRNA549434209,65,12,99,82,72,52,24,79,31 +ENSRNA549434210,0,0,0,0,0,0,0,0,0 diff --git a/tests/test_data/normalisation/base/counts.tsv b/tests/test_data/normalisation/base/counts.tsv new file mode 100644 index 00000000..17db2d66 --- /dev/null +++ b/tests/test_data/normalisation/base/counts.tsv @@ -0,0 +1,13 @@ + E_MTAB_5038_rnaseq_SRR1586392 E_MTAB_5038_rnaseq_SRR1586393 E_MTAB_5038_rnaseq_SRR1586394 E_MTAB_5038_rnaseq_SRR1586395 E_MTAB_5038_rnaseq_SRR1586396 E_MTAB_5038_rnaseq_SRR1586397 E_MTAB_5038_rnaseq_SRR1586400 E_MTAB_5038_rnaseq_SRR1586401 E_MTAB_5038_rnaseq_SRR1586402 +ENSRNA549434199 14 25 27 47 39 34 38 19 64 +ENSRNA549434200 91 37 78 84 6 51 18 2 57 +ENSRNA549434201 98 48 69 7 73 48 57 92 36 
+ENSRNA549434202 52 15 41 19 8 100 85 83 97 +ENSRNA549434203 86 71 53 16 66 23 12 42 33 +ENSRNA549434204 62 2 25 89 74 32 45 56 26 +ENSRNA549434205 98 42 79 76 74 85 3 91 56 +ENSRNA549434206 42 49 4 88 82 34 27 83 98 +ENSRNA549434207 82 93 85 14 38 8 98 97 30 +ENSRNA549434208 72 36 4 60 25 7 14 76 47 +ENSRNA549434209 65 12 99 82 72 52 24 79 31 +ENSRNA549434210 0 0 0 0 0 0 0 0 0 diff --git a/tests/input/normalize/design.csv b/tests/test_data/normalisation/base/design.csv similarity index 100% rename from tests/input/normalize/design.csv rename to tests/test_data/normalisation/base/design.csv diff --git a/tests/test_data/normalisation/base/design.tsv b/tests/test_data/normalisation/base/design.tsv new file mode 100644 index 00000000..fca7e731 --- /dev/null +++ b/tests/test_data/normalisation/base/design.tsv @@ -0,0 +1,10 @@ +batch condition sample +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586392 +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586393 +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586394 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586395 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586396 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586397 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586400 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586401 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586402 diff --git a/tests/test_data/normalisation/base/gene_lengths.csv b/tests/test_data/normalisation/base/gene_lengths.csv new file mode 100644 index 00000000..67b05cee --- /dev/null +++ b/tests/test_data/normalisation/base/gene_lengths.csv @@ -0,0 +1,13 @@ +gene_id,length +ENSRNA549434199,100 +ENSRNA549434200,200 +ENSRNA549434201,300 +ENSRNA549434202,400 +ENSRNA549434203,500 +ENSRNA549434204,600 +ENSRNA549434205,700 +ENSRNA549434206,800 +ENSRNA549434207,900 +ENSRNA549434208,1000 +ENSRNA549434209,1100 +ENSRNA549434210,1200 diff --git a/tests/test_data/normalisation/many_zeros/counts.csv b/tests/test_data/normalisation/many_zeros/counts.csv new file mode 100644 
index 00000000..261de1aa --- /dev/null +++ b/tests/test_data/normalisation/many_zeros/counts.csv @@ -0,0 +1,6 @@ +,E_CURD_1_rnaseq_ERR274309,E_CURD_1_rnaseq_ERR274310,E_CURD_1_rnaseq_SRR070570,E_CURD_1_rnaseq_SRR070571,E_CURD_1_rnaseq_SRR1001909,E_CURD_1_rnaseq_SRR1001910,E_CURD_1_rnaseq_SRR1019221,E_CURD_1_rnaseq_SRR1046909,E_CURD_1_rnaseq_SRR1046910,E_CURD_1_rnaseq_SRR1105822,E_CURD_1_rnaseq_SRR1105823,E_CURD_1_rnaseq_SRR1106559,E_CURD_1_rnaseq_SRR1159821,E_CURD_1_rnaseq_SRR1159827,E_CURD_1_rnaseq_SRR1159831,E_CURD_1_rnaseq_SRR1159837,E_CURD_1_rnaseq_SRR949993 +AT1G80990,0,0,1,0,1,1,0,0,1,1,3,0,0,1,1,1,0 +AT2G01008,11,24,3,4,6,4,0,0,2,0,0,1,4,2,4,4,0 +AT2G01010,9,1,195,195,8,33,0,14,7,0,0,2,1,0,0,0,0 +AT2G01020,34,27,41,55,58,107,2,10,20,1,3,1,4,2,3,0,0 +AT2G01021,22,10,0,0,0,0,0,106,20,0,0,1,0,0,0,0,0 diff --git a/tests/test_data/normalisation/many_zeros/design.csv b/tests/test_data/normalisation/many_zeros/design.csv new file mode 100644 index 00000000..a6473d3a --- /dev/null +++ b/tests/test_data/normalisation/many_zeros/design.csv @@ -0,0 +1,18 @@ +batch,condition,sample +E_CURD_1_rnaseq,g2,E_CURD_1_rnaseq_ERR274309 +E_CURD_1_rnaseq,g3,E_CURD_1_rnaseq_ERR274310 +E_CURD_1_rnaseq,g23,E_CURD_1_rnaseq_SRR070570 +E_CURD_1_rnaseq,g23,E_CURD_1_rnaseq_SRR070571 +E_CURD_1_rnaseq,g55,E_CURD_1_rnaseq_SRR1001909 +E_CURD_1_rnaseq,g55,E_CURD_1_rnaseq_SRR1001910 +E_CURD_1_rnaseq,g56,E_CURD_1_rnaseq_SRR1019221 +E_CURD_1_rnaseq,g48,E_CURD_1_rnaseq_SRR1046909 +E_CURD_1_rnaseq,g48,E_CURD_1_rnaseq_SRR1046910 +E_CURD_1_rnaseq,g50,E_CURD_1_rnaseq_SRR1105822 +E_CURD_1_rnaseq,g50,E_CURD_1_rnaseq_SRR1105823 +E_CURD_1_rnaseq,g50,E_CURD_1_rnaseq_SRR1106559 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159821 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159827 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159831 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159837 +E_CURD_1_rnaseq,g44,E_CURD_1_rnaseq_SRR949993 diff --git a/tests/test_data/normalisation/many_zeros/gene_lengths.csv 
b/tests/test_data/normalisation/many_zeros/gene_lengths.csv new file mode 100644 index 00000000..923e2d65 --- /dev/null +++ b/tests/test_data/normalisation/many_zeros/gene_lengths.csv @@ -0,0 +1,6 @@ +gene_id,length +AT1G80990,100 +AT2G01008,200 +AT2G01010,300 +AT2G01020,400 +AT2G01021,500 diff --git a/tests/test_data/normalisation/one_group/counts.csv b/tests/test_data/normalisation/one_group/counts.csv new file mode 100644 index 00000000..0ec999c9 --- /dev/null +++ b/tests/test_data/normalisation/one_group/counts.csv @@ -0,0 +1,6 @@ +sampleA,sampleB,sampleC,sampleD +ENSG00000000003,14,4,4,10 +ENSG00000000005,0,0,0,0 +ENSG00000000419,562,584,523,616 +ENSG00000000457,586,377,207,491 +ENSG00000000460,130,55,28,77 diff --git a/tests/test_data/normalisation/one_group/design.csv b/tests/test_data/normalisation/one_group/design.csv new file mode 100644 index 00000000..9aaadb1d --- /dev/null +++ b/tests/test_data/normalisation/one_group/design.csv @@ -0,0 +1,5 @@ +batch,condition,sample +batch1,g1,sampleA +batch1,g1,sampleB +batch1,g1,sampleC +batch1,g1,sampleD diff --git a/tests/test_data/normalisation/one_group/gene_lengths.csv b/tests/test_data/normalisation/one_group/gene_lengths.csv new file mode 100644 index 00000000..73eb9655 --- /dev/null +++ b/tests/test_data/normalisation/one_group/gene_lengths.csv @@ -0,0 +1,6 @@ +gene_id,length +ENSG00000000003,100 +ENSG00000000005,200 +ENSG00000000419,300 +ENSG00000000457,400 +ENSG00000000460,500 diff --git a/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet b/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet new file mode 100644 index 00000000..bc192ae3 Binary files /dev/null and b/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet differ diff --git a/tests/test_data/normfinder/small_normalised/design.csv b/tests/test_data/normfinder/small_normalised/design.csv new file mode 100644 index 00000000..6a212658 --- /dev/null +++ 
b/tests/test_data/normfinder/small_normalised/design.csv
@@ -0,0 +1,12 @@
+batch,condition,sample
+E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883576
+E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883577
+E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883578
+E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883579
+E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883580
+E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883581
+E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948460
+E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948461
+E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948462
+E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948463
+E_MTAB_4789_rnaseq,g9,E_MTAB_4789_rnaseq_SRR948464
diff --git a/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet b/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet
new file mode 100644
index 00000000..5e31470f
Binary files /dev/null and b/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet differ
diff --git a/tests/test_data/normfinder/very_small_cq/design.csv b/tests/test_data/normfinder/very_small_cq/design.csv
new file mode 100644
index 00000000..221601a5
--- /dev/null
+++ b/tests/test_data/normfinder/very_small_cq/design.csv
@@ -0,0 +1,7 @@
+sample,condition,batch
+S1,control,A
+S2,treated,A
+S3,control,A
+S4,treated,A
+S5,control,A
+S6,treated,A
diff --git a/tests/test_data/normfinder/very_small_cq/normfinder.R b/tests/test_data/normfinder/very_small_cq/normfinder.R
new file mode 100644
index 00000000..f415f95f
--- /dev/null
+++ b/tests/test_data/normfinder/very_small_cq/normfinder.R
@@ -0,0 +1,314 @@
+library(optparse)
+library(dplyr)
+library(tidyr)
+
+
+get_args <- function() {
+  option_list <- list(
+    make_option("--data", type = "character")
+  )
+
+  args <- parse_args(OptionParser(
+    option_list = option_list,
+    description = "Normfinder"
+  ))
+  return(args)
+}
+
+
+# NormFinder stability computation.
+#
+# Arguments:
+#   data      long-format data.frame, one row per (sample, gene) measurement
+#   group     TRUE: group samples by the `groups` column; FALSE: treat all samples as one group
+#   ctVal     FALSE: log2() is applied to the values first; TRUE: values are used as given
+#   pStabLim  stability threshold used to pre-select candidate genes for the pairwise search
+#   sample, gene, groups, cq  names of the corresponding columns in `data`
+#
+# Returns an unnamed list of data.frames: per-gene results sorted ascending by stability value
+# (by SD for a single group), per-gene results in input order (several groups only), gene pairs.
+normfinder<-function(data, group = TRUE, ctVal=FALSE, pStabLim=0.3, sample = "sample", gene = "gene", groups = "group", cq = "cq"){
+
+  # Group & sample ID
+  sample_group <- unique(data[,c(sample, groups)])
+
+  tmp <- data.frame(sample = as.character(data[, sample]),
+                    gene = as.character(data[, gene]),
+                    cq = as.numeric(data[, cq]))
+  tmp <- tmp %>%
+    dplyr::group_by(sample, gene) %>%
+    dplyr::summarise(cq=mean(cq, na.rm=TRUE)) %>%
+    tidyr::spread(sample, cq)
+
+  ntotal<-length(sample_group[,1])
+
+  if (group == TRUE){
+    ngenes <- length(tmp$gene) # number of genes
+    genenames <- as.character(tmp$gene)
+    grId <- factor(sample_group[,2])
+  } else {
+    ngenes <- length(tmp$gene) # number of genes
+    genenames <- as.character(tmp$gene)
+    # factor() so that levels(grId) below yields one group instead of NULL
+    grId <- factor(rep(1,ntotal))
+  }
+
+  tmp <- data.matrix(tmp[,sample_group[,1]])
+
+  if (!ctVal){tmp<-log2(tmp)}
+
+
+  groupnames <- levels(grId)
+  ngr <- length(levels(grId))
+
+  # Number of samples in each group:
+  nsamples <- rep(0,ngr)
+  for (group in 1:ngr){nsamples[group] <- sum(grId==groupnames[group])}
+
+
+
+  MakeStab <- function(da){
+
+    ngenes <- dim(da)[1]
+    # Sample averages
+    sampleavg <- apply(da,2,mean)
+    # Gene averages within group
+    genegroupavg <- matrix(0,ngenes,ngr)
+
+    for (group in 1:ngr){
+      genegroupavg[,group] <- apply(da[,grId==groupnames[group]],1,mean)}
+
+    # Group averages
+    groupavg=rep(0,ngr)
+    for (group in 1:ngr){groupavg[group] <- mean(da[,grId==groupnames[group]])}
+
+    # Variances
+    GGvar=matrix(0,ngenes,ngr)
+    for (group in 1:ngr){
+      grset <- (grId==groupnames[group])
+      a=rep(0,ngenes)
+      for (gene in 1:ngenes){
+        a[gene] <- sum((da[gene,grset]-genegroupavg[gene,group]-
+                          sampleavg[grset]+groupavg[group])^2)/(nsamples[group]-1)
+      }
+      GGvar[,group] <- (a-sum(a)/(ngenes*ngenes-ngenes))/(1-2/ngenes)
+    }
+
+    print("GGvar")
+    print(GGvar)
+
+    #
+    # Change possible negative values
+    genegroupMinvar <- matrix(0, ngenes, ngr)
+    for (group in 1:ngr){
+      grset <- (grId == groupnames[group])
+      z <- da[,grset]
+      for (gene in 1:ngenes){
+        varpair <- rep(0,ngenes)
+        for (gene1 in 1:ngenes){varpair[gene1] <- var(z[gene,] - z[gene1,])}
+        genegroupMinvar[gene,group] <- min(varpair[-gene])/4
+      }
+    }
+    #
+    # Final variances
+    GGvar <- ifelse(GGvar < 0, genegroupMinvar, GGvar)
+    print("GGvar")
+    print(GGvar)
+    #
+    # Old stability measure for each gene is calculated:
+    #
+    dif <- genegroupavg
+    difgeneavg <- apply(dif, 1, mean)
+    difgroupavg <- apply(dif, 2, mean)
+    difavg <- mean(dif)
+    for (gene in 1:ngenes){
+      for (group in 1:ngr){
+        dif[gene,group] <- dif[gene, group] - difgeneavg[gene] - difgroupavg[group] + difavg
+      }
+    }
+    #
+    nsampMatrix <- matrix(rep(nsamples,ngenes),ngenes,ngr,byrow=TRUE)
+    vardif <- GGvar/nsampMatrix
+    gamma <- sum(dif * dif) / ((ngr-1) * (ngenes-1)) -sum (vardif) / (ngenes*ngr)
+    gamma <- ifelse(gamma<0,0,gamma)
+    #
+    difnew <- dif * gamma / (gamma+vardif)
+    varnew <- vardif + gamma * vardif / (gamma+vardif)
+    Ostab0 <- abs(difnew) + sqrt(varnew)
+    Ostab <- apply(Ostab0, 1, mean)
+
+    #
+    # Measure of group differences:
+    mud <- rep(0,ngenes)
+    for (gene in 1:ngenes){
+      mud[gene] <- 2*max(abs(dif[gene,]))
+    }
+    # Common variance:
+    genevar <- rep(0,ngenes)
+    for (gene in 1:ngenes){
+      genevar[gene] <- sum((nsamples-1) * GGvar[gene,]) / (sum(nsamples)-ngr)
+    }
+    Gsd <- sqrt(genevar)
+    #
+    # Return results:
+    #
+    return(cbind(mud, Gsd, Ostab, rep(gamma,ngenes), GGvar,dif))
+  } # End of function MakeStab
+  #
+  #
+  MakeComb2 <- function(g1, g2, res){
+    gam <- res[1,4]
+    d1 <- res[g1,(4 + ngr + 1):(4 + ngr + ngr)]; d2 <- res[g2, (4 + ngr + 1):(4+ngr+ngr)]
+    s1 <- res[g1, (4+1):(4+ngr)]; s2 <- res[g2, (4+1):(4+ngr)]
+    rho <- abs(gam * d1 / (gam + s1 / nsamples) + gam * d2 / (gam + s2 / nsamples)) * sqrt(ngenes / (ngenes-2)) / 2
+    rho <- rho + sqrt(s1 / nsamples + gam * s1 / (nsamples*gam+s1) + s2 / nsamples + gam * s2 / (nsamples*gam+s2))/2
+    return(mean(rho))
+  }
+  #
+  #
+  MakeStabOne <- function(da){
+    ngenes <- dim(da)[1]
+    # Sample averages
+    sampleavg <- apply(da, 2, mean)
+    # Gene averages
+    geneavg <- apply(da, 1, mean)
+    totalavg <- mean(da)
+    #
+    # Variances
+    genevar0 <- rep(0, ngenes)
+    # Residuals are computed on `da` (the matrix passed in); the pair search below
+    # calls this with a reduced matrix, so reading the enclosing `tmp` would be wrong.
+    for (gene in 1:ngenes){
+      genevar0[gene] <- sum((da[gene,] - geneavg[gene] - sampleavg + totalavg)^2) / ((ntotal-1) * (1-2/ngenes))
+    }
+    genevar <- genevar0 - sum(genevar0) / (ngenes*ngenes-ngenes)
+    #
+    # Change possible negative values
+    geneMinvar <- rep(0,ngenes)
+    z <- da
+    for (gene in 1:ngenes){
+      varpair <- rep(0, ngenes)
+      for (gene1 in 1:ngenes){varpair[gene1] <- var(z[gene,] - z[gene1,])}
+      geneMinvar[gene] <- min(varpair[-gene]) / 4
+    }
+    # Final variances
+    genevar = ifelse(genevar<0, geneMinvar, genevar)
+    #
+    return(genevar)
+  }
+  # End of function MakeStabOne
+
+  #### Main function ####
+  if (ngr>1){ # More than one group.
+    #
+    res <- MakeStab(tmp)
+    #
+    gcand <- c(1:ngenes)[res[,3] < pStabLim]
+    ncand <- length(gcand)
+    if (ncand<4){
+      if (ngenes>3){
+        li <- sort(res[,3])[4]
+        gcand <- c(1:ngenes)[res[,3]<=li]
+        ncand <- length(gcand)
+      } else {
+        gcand <- c(1:ngenes)
+        ncand <- length(gcand)
+      }
+    }
+    #
+    vv2 <- c()
+    #
+    for (g1 in 1:(ncand-1)){
+      for (g2 in (g1+1):ncand){
+        qmeas <- MakeComb2(gcand[g1], gcand[g2], res)
+        vv2 <- rbind(vv2, c(gcand[g1], gcand[g2], qmeas))
+      }}
+    #
+    ord <- order(res[,3])
+    # NOTE: `<-` inside list() leaves the elements unnamed; access them by position.
+    FinalRes <- list(Ordered <- data.frame("GroupDif" = round(res[ord,1],3),
+                                           "GroupSD" = round(res[ord,2],3),
+                                           "Stability" = round(res[ord,3],3),
+                                           row.names = genenames[ord]),
+                     UnOrdered <- data.frame("GroupDif" = round(res[,1],3),
+                                             "GroupSD" = round(res[,2],3),
+                                             "Stability" = round(res[,3],3),
+                                             "IGroupSD" = round(sqrt(res[,(4+1):(4+ngr)]),3),
+                                             "IGroupDif" = round(res[,(4+ngr+1):(4+ngr+ngr)],3),
+                                             row.names = genenames),
+                     PairOfGenes <- data.frame("Gene1" = genenames[vv2[,1]],
+                                               "Gene2" = genenames[vv2[,2]],
+                                               "Stability" = round(vv2[,3],3)))
+    #
+    return(FinalRes)
+    #
+  } else { # End of more than one group: next is for one group only.
+    #
+    #
+    sigma <- sqrt(MakeStabOne(tmp))
+    #
+    siglim <- (min(sigma)+0.1)
+    # Candidates: genes whose SD is within 0.1 of the smallest SD
+    gcand <- c(1:ngenes)[sigma<siglim]
+    ncand <- length(gcand)
+    #
+    if ((ncand>=2) & (ngenes>3)){
+      #
+      vv2=c()
+      #
+      for (g1 in 1:(ncand-1)){
+        for (g2 in (g1+1):ncand){
+          dat1 <- rbind(tmp[-c(gcand[g1], gcand[g2]),],
+                        apply(tmp[c(gcand[g1], gcand[g2]),], 2, mean))
+          qmeas <- sqrt(MakeStabOne(dat1))
+          vv2 <- rbind(vv2, c(gcand[g1], gcand[g2], qmeas[ngenes-1]))
+        }}
+      ord <- order(sigma)
+      FinalRes <- list(Ordered <- data.frame("GroupSD" = round(sigma[ord],3),
+                                             row.names = genenames[ord]),
+                       PairOfGenes <- data.frame("Gene1" = genenames[vv2[,1]],
+                                                 "Gene2" = genenames[vv2[,2]],
+                                                 "GroupSD" = round(vv2[,3],3)))
+    } else { # No combined genes to consider
+      ord <- order(sigma)
+      FinalRes <- list(Ordered <- data.frame("GroupSD" = round(sigma[ord],3),
+                                             row.names = genenames[ord]))
+    } # End ncand<2 or ngenes<=3
+    #
+    return(FinalRes)
+    #
+  } # End one group only
+
+} #####
+
+# Read the counts file
+counts <- read.csv("all_counts.normfinder.csv")
+
+# Build design (conditions per sample)
+design <- data.frame(
+  sample = c("S1","S2","S3","S4","S5","S6"),
+  group = c("control","treated","control","treated","control","treated")
+)
+
+# Convert counts wide → long
+library(tidyr)
+library(dplyr)
+
+data <- counts %>%
+  tidyr::pivot_longer(
+    cols = -gene_id,
+    names_to = "sample",
+    values_to = "cq"
+  ) %>%
+  dplyr::rename(gene = gene_id) %>%
+  dplyr::left_join(design, by = "sample")
+
+# Inspect
+#print(data)
+
+data <- as.data.frame(data)
+
+
+res = normfinder(data, ctVal=TRUE)
+print("res")
+print(res)
diff --git a/tests/test_data/public_accessions/exclude_one_geo_accession.txt b/tests/test_data/public_accessions/exclude_one_geo_accession.txt
new file mode 100644
index 00000000..c6978b9b
--- /dev/null
+++ b/tests/test_data/public_accessions/exclude_one_geo_accession.txt
@@ -0,0 +1 @@
+GSE55951
diff --git a/tests/test_data/public_accessions/exclude_two_geo_accessions.txt b/tests/test_data/public_accessions/exclude_two_geo_accessions.txt
new file
mode 100644 index 00000000..0ef19a43 --- /dev/null +++ b/tests/test_data/public_accessions/exclude_two_geo_accessions.txt @@ -0,0 +1,2 @@ +GSE79526 +GSE55951 diff --git a/tests/test_data/quantile_normalisation/count.raw.cpm.csv b/tests/test_data/quantile_normalisation/count.raw.cpm.csv new file mode 100644 index 00000000..e8ecde05 --- /dev/null +++ b/tests/test_data/quantile_normalisation/count.raw.cpm.csv @@ -0,0 +1,10 @@ +,sample_63,sample_64,sample_65,sample_66,sample_67,sample_68,sample_69,sample_70 +ENSRNA049454747,9.07095165125094,56.5509090498679,12.6897789869867,15.7656784862991,4.55005160208214,5.21967362537592,8.87627280506172,6.33326316409849 +ENSRNA049454887,0.740485849081709,1.66326203087847,0.229679257683017,0.785665040845472,2.20608562525195,2.37257892062542,0.365278716257684,0.139192597013154 +ENSRNA049454931,1.20328950475778,2.61369747709473,0.574198144207542,1.46657474291155,2.0682052736737,3.32161048887559,0.620973817638062,0.591568537305903 +ENSRNA049454947,1.48097169816342,2.1384797539866,0.459358515366033,1.57133008169094,2.89548738314318,3.08435259681304,0.474862331134989,0.452375940292749 +ENSRNA049454955,1.29585023589299,2.61369747709473,0.516778329786788,1.09993105718366,3.8606498441909,4.03338416506321,0.584445946012294,0.452375940292749 +ENSRNA049454963,1.38841096702821,4.51456836952727,1.43549536051885,2.7236388082643,4.96369265681688,5.45693151743846,1.35153125015343,1.25273337311838 +ENSRNA049454974,1.66609316043385,3.564132923311,2.52647183451318,2.46175046131581,5.51521406312986,12.5746682793147,1.71680996641111,1.53111856714469 +ENSRNA049455639,0.185121462270427,0.237608861554067,0.803877401890558,1.15230872657336,0.137880351578247,0.237257892062542,0.438334459509221,0.417577791039461 +ENSRNA049455690,0.0925607311352137,1.18804430777033,0.746457587469804,2.98552715521279,0.137880351578247,0.237257892062542,0.876668919018441,0.487174089546038 diff --git a/tests/workflows/stableexpression.nf.test 
b/tests/workflows/stableexpression.nf.test deleted file mode 100644 index c6bbd715..00000000 --- a/tests/workflows/stableexpression.nf.test +++ /dev/null @@ -1,95 +0,0 @@ -nextflow_workflow { - - name "Test Workflow STABLEEXPRESSION" - script "workflows/stableexpression.nf" - workflow "STABLEEXPRESSION" - tag "workflow" - - test("Two Expression Atlas accessions provided") { - - tag "workflow_eatlas_accessions" - - when { - params { - species = 'solanum tuberosum' - fetch_eatlas_accessions = false - eatlas_accessions = "E-MTAB-552,E-GEOD-61690" - } - } - - then { - assert workflow.success - assert snapshot(workflow.out).match() - } - } - - test("Two Expression Atlas no keyword (whole species)") { - - tag "workflow_eatlas_no_kw" - - when { - params { - species = 'solanum tuberosum' - fetch_eatlas_accessions = true - } - } - - then { - assert workflow.success - assert snapshot(workflow.out).match() - } - } - - test("Two Expression Atlas keywords provided") { - - tag "workflow_eatlas_kw" - - when { - params { - species = 'solanum tuberosum' - eatlas_keywords = "potato,stress" - } - } - - then { - assert workflow.success - assert snapshot(workflow.out).match() - } - } - - test("Two Expression Atlas keywords provided - Normalization with EdgeR") { - - tag "workflow_eatlas_kw_edger" - - when { - params { - species = 'solanum tuberosum' - eatlas_keywords = "potato,stress" - normalization_method = "edger" - } - } - - then { - assert workflow.success - assert snapshot(workflow.out).match() - } - } - - test("Full workflow - Expression Atlas only") { - - when { - params { - species = 'solanum tuberosum' - eatlas_accessions = "E-MTAB-552,E-GEOD-61690" - eatlas_keywords = "phloem" - } - } - - then { - assert workflow.success - assert snapshot(workflow.out).match() - } - - } - -} diff --git a/tests/workflows/stableexpression.nf.test.snap b/tests/workflows/stableexpression.nf.test.snap deleted file mode 100644 index cb3d38da..00000000 --- 
a/tests/workflows/stableexpression.nf.test.snap +++ /dev/null @@ -1,70 +0,0 @@ -{ - "Two Expression Atlas keywords provided - Normalization with EdgeR": { - "content": [ - { - "0": [ - "variation_coefficients.csv:md5,a45d061a9118f6657848663844f769ee" - ], - "ch_output_from_variation_coefficient": [ - "variation_coefficients.csv:md5,a45d061a9118f6657848663844f769ee" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T15:12:21.118208" - }, - "Full workflow - Expression Atlas only": { - "content": [ - { - "0": [ - "variation_coefficients.csv:md5,68c7ec46c5fa350a2d7a37cfc167980e" - ], - "ch_output_from_variation_coefficient": [ - "variation_coefficients.csv:md5,68c7ec46c5fa350a2d7a37cfc167980e" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T13:49:56.353746" - }, - "Two Expression Atlas keywords provided": { - "content": [ - { - "0": [ - "variation_coefficients.csv:md5,bbf85cf21180ab7306d23f3fac07ea14" - ], - "ch_output_from_variation_coefficient": [ - "variation_coefficients.csv:md5,bbf85cf21180ab7306d23f3fac07ea14" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-28T23:44:22.027225612" - }, - "Two Expression Atlas accessions provided": { - "content": [ - { - "0": [ - "variation_coefficients.csv:md5,744a2ee2b5d266640abd9139fbd925f9" - ], - "ch_output_from_variation_coefficient": [ - "variation_coefficients.csv:md5,744a2ee2b5d266640abd9139fbd925f9" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2024-12-29T12:40:43.491641" - } -} \ No newline at end of file diff --git a/workflows/stableexpression.nf b/workflows/stableexpression.nf index 5203547a..09325a3a 100644 --- a/workflows/stableexpression.nf +++ b/workflows/stableexpression.nf @@ -1,23 +1,21 @@ -nextflow.enable.dsl = 2 -nextflow.preview.topic = true - /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
-include { EXPRESSIONATLAS_GETACCESSIONS } from '../modules/local/expressionatlas/getaccessions/main'
-include { EXPRESSIONATLAS_GETDATA } from '../modules/local/expressionatlas/getdata/main'
-include { DESEQ2_NORMALIZE } from '../modules/local/deseq2/normalize/main'
-include { EDGER_NORMALIZE } from '../modules/local/edger/normalize/main'
-include { GPROFILER_IDMAPPING } from '../modules/local/gprofiler/idmapping/main'
-include { VARIATION_COEFFICIENT } from '../modules/local/variation_coefficient/main'
-
-include { customSoftwareVersionsToYAML } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline'
-include { paramsSummaryMap } from 'plugin/nf-schema'
-include { samplesheetToList } from 'plugin/nf-schema'
+include { GET_PUBLIC_ACCESSIONS } from '../subworkflows/local/get_public_accessions'
+include { DOWNLOAD_PUBLIC_DATASETS } from '../subworkflows/local/download_public_datasets'
+include { ID_MAPPING } from '../subworkflows/local/idmapping'
+include { SAMPLE_FILTERING } from '../subworkflows/local/sample_filtering'
+include { EXPRESSION_NORMALISATION } from '../subworkflows/local/expression_normalisation'
+include { DATASET_ANALYSIS } from '../subworkflows/local/dataset_analysis'
+include { MERGE_DATA } from '../subworkflows/local/merge_data'
+include { GENE_STATISTICS } from '../subworkflows/local/gene_statistics'
+include { STABILITY_SCORING } from '../subworkflows/local/stability_scoring'
+include { REPORTING } from '../subworkflows/local/reporting'
+include { checkCounts } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -27,175 +25,205 @@ include { samplesheetToList } from 'plugin/nf-schema'
 workflow STABLEEXPRESSION {
-    //
-    // Checking input parameters
-    //
-
-    if ( !params.species ) {
-        error('You must provide a species name')
-    }
-
-    if (
-        !params.datasets
-        && !params.eatlas_accessions
-        && !params.fetch_eatlas_accessions
-        ) {
-        error('You must provide at least either --datasets or --fetch_eatlas_accessions or --eatlas_accessions or --eatlas_keywords')
-    }
-
-    //
-    // Initializing channels
-    //
-
-    def species = params.species.split(' ').join('_')
-    ch_species = Channel.value(species)
-
-    ch_normalized_datasets = Channel.empty()
-    ch_raw_datasets = Channel.empty()
-    ch_accessions = Channel.empty()
-
-    // if input datasets were provided
-    if ( params.datasets ) {
-
-        //
-        // Parsing input datasets
-        //
-
-        // reads list of input datasets from input file
-        // and splits them in normalized and raw sub-channels
-        Channel.fromList( samplesheetToList(params.datasets, "${projectDir}/assets/schema_input.json") )
-            .map {
-                item ->
-                    def (count_file, design_file, normalized) = item
-                    meta = [accession: count_file.name, design: design_file]
-                    [meta, count_file, normalized]
-            }
-            .branch {
-                item ->
-                    normalized: item[2] == true
-                    raw: item[2] == false
-            }
-            .set { ch_input_datasets }
-
-        // removes the third element ("normalized" column) and adds to the corresponding channel
-        ch_normalized_datasets = ch_normalized_datasets.concat(
-            ch_input_datasets.normalized.map{ it -> it.take(2) }
-        )
-        ch_raw_datasets = ch_raw_datasets.concat(
-            ch_input_datasets.raw.map{ it -> it.take(2) }
+    // Fetches public accessions, downloads and normalises datasets, merges them,
+    // scores gene stability and produces the final report.
+    take:
+    // Input datasets passed to this workflow; mixed with the downloaded datasets below
+    ch_input_datasets
+
+
+    main:
+
+    // Default every channel to empty so that REPORTING can still be called when
+    // params.accessions_only or params.download_only skip the steps that fill them
+    ch_accessions = channel.empty()
+    ch_downloaded_datasets = channel.empty()
+    ch_all_counts = channel.empty()
+    ch_all_imputed_counts = channel.empty()
+    ch_whole_design = channel.empty()
+    ch_stats_all_genes_with_scores = channel.empty()
+    ch_platform_statistics = channel.empty()
+    ch_whole_gene_metadata = channel.empty()
+    ch_whole_gene_id_mapping = channel.empty()
+    ch_most_stable_genes_summary = channel.empty()
+    ch_all_genes_statistics = channel.empty()
+    ch_most_stable_genes_transposed_counts = channel.empty()
+
+    def species = params.species.split(' ').join('_').toLowerCase()
+
+    // -----------------------------------------------------------------
+    // FETCH PUBLIC ACCESSIONS
+    // -----------------------------------------------------------------
+
+    GET_PUBLIC_ACCESSIONS(
+        species,
+        params.skip_fetch_eatlas_accessions,
+        params.fetch_geo_accessions,
+        params.platform,
+        params.keywords,
+        channel.fromList( params.accessions.tokenize(',') ),
+        params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty(),
+        channel.fromList( params.excluded_accessions.tokenize(',') ),
+        params.excluded_accessions_file ? channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty(),
+        params.random_sampling_size,
+        params.random_sampling_seed,
+        params.outdir
         )
-    }
+    ch_accessions = GET_PUBLIC_ACCESSIONS.out.accessions
-    // parsing Expression Atlas accessions if provided
-    if ( params.eatlas_accessions ) {
+    // -----------------------------------------------------------------
+    // DOWNLOAD GEO DATASETS IF NEEDED
+    // -----------------------------------------------------------------
-        // parsing accessions from provided parameter
-        ch_accessions = Channel.fromList( params.eatlas_accessions.tokenize(',') )
+    if ( !params.accessions_only) {
-    }
-
-
-    // fetching Expression Atlas accessions if applicable
-    if ( params.fetch_eatlas_accessions || params.eatlas_keywords ) {
-
-        //
-        // MODULE: Expression Atlas - Get accessions
-        //
-
-        // keeping the keywords (separated by spaces) as a single string
-        ch_keywords = Channel.value( params.eatlas_keywords )
-
-        // getting Expression Atlas accessions given a species name and keywords
-        // keywords can be an empty string
-        EXPRESSIONATLAS_GETACCESSIONS( ch_species, ch_keywords )
+        DOWNLOAD_PUBLIC_DATASETS (
+            species,
+            ch_accessions
+        )
-        // appending to accessions provided by the user
-        // ensures that no accessions is present twice (provided by the user and fetched from E. Atlas)
-        ch_accessions = ch_accessions
-            .concat( EXPRESSIONATLAS_GETACCESSIONS.out.txt.splitText() )
-            .unique()
+        ch_downloaded_datasets = DOWNLOAD_PUBLIC_DATASETS.out.datasets
     }
-    // logging accessions if present
-    ch_accessions.collect().map { items -> println "Obtained accessions ${items}"}
-
-    //
-    // MODULE: Expression Atlas - Get data
-    //
-
-    // Downloading Expression Atlas data for each accession in ch_accessions
-    EXPRESSIONATLAS_GETDATA( ch_accessions )
-
-    // separating and arranging EXPRESSIONATLAS_GETDATA output in two separate channels (already normalized or raw data)
-    ch_normalized_datasets = ch_normalized_datasets.concat(
-        EXPRESSIONATLAS_GETDATA.out.normalized.map {
-            accession, design_file, count_file ->
-                meta = [accession: accession, design: design_file]
-                [meta, count_file]
-        }
-    )
+    if ( !params.accessions_only && !params.download_only ) {
+
+        ch_counts = ch_input_datasets.mix( ch_downloaded_datasets )
+        // returns an error with a message if no dataset was found
+        checkCounts( ch_counts, params.fetch_geo_accessions )
+
+        // -----------------------------------------------------------------
+        // IDMAPPING
+        // -----------------------------------------------------------------
+
+        // tries to map gene IDs to Ensembl IDs whenever possible
+        ID_MAPPING(
+            ch_counts,
+            species,
+            params.skip_id_mapping,
+            params.skip_cleaning_gene_ids,
+            params.gprofiler_target_db,
+            params.gene_id_mapping,
+            params.gene_metadata,
+            params.min_occurrence_freq,
+            params.min_occurrence_quantile,
+            params.outdir
+        )
-    ch_raw_datasets = ch_raw_datasets.concat(
-        EXPRESSIONATLAS_GETDATA.out.raw.map {
-            accession, design_file, count_file ->
-                meta = [accession: accession, design: design_file]
-                [meta, count_file]
-        }
-    )
+        ch_counts = ID_MAPPING.out.counts
+        ch_whole_gene_id_mapping = ID_MAPPING.out.mapping
+        ch_whole_gene_metadata = ID_MAPPING.out.metadata
+        ch_valid_gene_ids = ID_MAPPING.out.valid_gene_ids
+
+        // -----------------------------------------------------------------
+        // FILTER OUT SAMPLES NOT VALID
+        // -----------------------------------------------------------------
+
+        SAMPLE_FILTERING (
+            ch_counts,
+            ch_valid_gene_ids,
+            params.max_zero_ratio,
+            params.max_null_ratio,
+            params.outdir
+        )
+        ch_ratio_nulls_per_sample_file = SAMPLE_FILTERING.out.ratio_nulls_per_sample_file
-    //
-    // MODULE: Normalization of raw count datasets (including RNA-seq datasets)
-    //
+        // -----------------------------------------------------------------
+        // NORMALISATION OF RAW COUNT DATASETS (INCLUDING RNA-SEQ DATASETS)
+        // -----------------------------------------------------------------
-    if ( params.normalization_method == 'deseq2' ) {
-        DESEQ2_NORMALIZE(ch_raw_datasets)
-        ch_raw_datasets_normalized = DESEQ2_NORMALIZE.out.csv
+        EXPRESSION_NORMALISATION(
+            species,
+            SAMPLE_FILTERING.out.counts,
+            params.normalisation_method,
+            params.quantile_norm_target_distrib,
+            params.gff,
+            params.gene_length
+        )
-    } else { // 'edger'
-        EDGER_NORMALIZE(ch_raw_datasets)
-        ch_raw_datasets_normalized = EDGER_NORMALIZE.out.csv
-    }
+        ch_normalised_counts = EXPRESSION_NORMALISATION.out.counts
-    // putting all normalized count datasets together
-    ch_normalized_datasets.concat( ch_raw_datasets_normalized ).set{ ch_all_normalized }
+        // -----------------------------------------------------------------
+        // ANALYSIS OF NORMALISED DATASETS
+        // -----------------------------------------------------------------
+        DATASET_ANALYSIS(
+            ch_normalised_counts
+        )
-    //
-    // MODULE: ID Mapping
-    //
+        // -----------------------------------------------------------------
+        // MERGE ALL DATASETS INTO ONE SINGLE DATASET
+        // -----------------------------------------------------------------
-    // tries to map gene IDs to Ensembl IDs whenever possible
-    GPROFILER_IDMAPPING( ch_all_normalized.combine(ch_species) )
+        MERGE_DATA (
+            ch_normalised_counts,
+            params.missing_value_imputer,
+            params.outdir
+        )
+        ch_all_imputed_counts = MERGE_DATA.out.all_imputed_counts
+        ch_all_counts = MERGE_DATA.out.all_counts
+        ch_whole_design = MERGE_DATA.out.whole_design
+        ch_platform_counts = MERGE_DATA.out.platform_counts
+
+        // -----------------------------------------------------------------
+        // COMPUTE BASE STATISTICS FOR ALL GENES
+        // -----------------------------------------------------------------
+
+        GENE_STATISTICS (
+            ch_all_imputed_counts,
+            ch_all_counts,
+            ch_platform_counts,
+            ch_ratio_nulls_per_sample_file,
+            params.max_null_ratio_valid_sample
+        )
-    //
-    // MODULE: Merge count files & compute variation coefficient for each gene
-    //
+        ch_all_datasets_stats = GENE_STATISTICS.out.stats
+        ch_platform_statistics = GENE_STATISTICS.out.platform_stats
+
+        // -----------------------------------------------------------------
+        // GET CANDIDATES AS REFERENCE GENE AND COMPUTES VARIOUS STABILITY VALUES
+        // -----------------------------------------------------------------
+
+        STABILITY_SCORING (
+            ch_all_imputed_counts.map{ meta, file -> file },
+            ch_whole_design,
+            ch_all_datasets_stats,
+            params.nb_candidates_per_section,
+            params.nb_sections,
+            params.skip_genorm,
+            params.stability_score_weights
+        )
-    VARIATION_COEFFICIENT( GPROFILER_IDMAPPING.out.csv.collect() )
-    ch_output_from_variation_coefficient = VARIATION_COEFFICIENT.out.csv
+        ch_stats_all_genes_with_scores = STABILITY_SCORING.out.summary_statistics
+    }
-    //
-    // Collate and save software versions
-    // TODO: use the nf-core functions when they are adapted to channel topics
-    //
+    // -----------------------------------------------------------------
+    // REPORTING
+    // -----------------------------------------------------------------
+
+    REPORTING(
+        ch_all_imputed_counts,
+        ch_whole_design,
+        ch_stats_all_genes_with_scores,
+        ch_platform_statistics,
+        ch_whole_gene_metadata,
+        ch_whole_gene_id_mapping,
+        params.target_genes,
+        params.target_gene_file,
+        params.multiqc_config,
+        params.multiqc_logo,
+        params.multiqc_methods_description,
+        params.outdir
+    )
-    customSoftwareVersionsToYAML( Channel.topic('versions') )
-        .collectFile(
-            storeDir: "${params.outdir}/pipeline_info",
-            name: 'software_versions.yml',
-            sort: true,
-            newLine: true
-        )
-    // only used for nf-test
     emit:
-    ch_output_from_variation_coefficient
-
+    multiqc_report = REPORTING.out.multiqc_report.toList()
+    all_genes_summary = REPORTING.out.all_genes_summary
 }