maize-genetics · matthewwiese · Dec 8, 2023 · Dec 8, 2023 · Dec 8, 2023 · Dec 8, 2023
diff --git a/.github/workflows/build_upload_conda.yml b/.github/workflows/build_upload_conda.yml
@@ -0,0 +1,33 @@
+name: Build and Upload Conda Package When Release is Published
+
+# Runs on manual dispatch, or when a release is published (in our case by the run_deploy_on_merge.yml action)
+on:
+  workflow_dispatch:
+  release:
+    types: [published]
+
+env:
+  ANACONDA_API_TOKEN: ${{ secrets.CONDA_TOKEN }}  # upload credential for anaconda-client
+  PHG2_VERSION: ${{ vars.PHG2_VERSION }}  # the PHG2_* repository variables are consumed by conda/meta.yaml
+  PHG2_RELEASE: ${{ vars.PHG2_RELEASE }}
+  PHG2_VERSION_MD5: ${{ vars.PHG2_VERSION_MD5 }}
+
+jobs:
+  build-upload-conda:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup Conda
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          auto-update-conda: true
+          python-version: "3.11"  # quoted so YAML never reads the version as a float
+          channels: conda-forge
+      - name: Install Conda build dependencies
+        run: |
+          conda install --yes -c defaults anaconda-client conda-build
+          conda config --set anaconda_upload yes
+      - name: Build and upload package
+        run: |
+          conda build ./conda
diff --git a/.github/workflows/run_deploy_on_merge.yml b/.github/workflows/run_deploy_on_merge.yml
@@ -152,3 +152,23 @@ jobs:
             ${{ steps.matching_release.outputs.body }}
             ${{ env.COMMIT_MSG }}
 
+      # Publish the new release version, release number and tarball MD5 as repository variables for the Conda build Action
+      - uses: action-pack/set-variable@v1
+        with:
+          name: 'PHG2_VERSION'
+          value: '${{ env.VERSION }}'
+          token: ${{ secrets.PHGV2CD }}
+      - uses: action-pack/set-variable@v1
+        with:
+          name: 'PHG2_RELEASE'
+          value: '${{ env.RELEASE }}'
+          token: ${{ secrets.PHGV2CD }}
+      - name: Calculate MD5 of package
+        run: |
+          md5="$(md5sum "PHGv2-v${RELEASE}.tar")"  # plain assignment: a missing tarball fails the step instead of storing an empty hash
+          echo "PHG2_VERSION_MD5=${md5%% *}" >> "$GITHUB_ENV"
+      - uses: action-pack/set-variable@v1
+        with:
+          name: 'PHG2_VERSION_MD5'
+          value: '${{ env.PHG2_VERSION_MD5 }}'
+          token: ${{ secrets.PHGV2CD }}
diff --git a/README.md b/README.md
@@ -1,4 +1,7 @@
 # PHG version 2
+> [!TIP]
+> The quickest way to install PHG2 is from Conda: `conda install -c conda-forge -c maize-genetics phg2`
+
 [![PHGv2 CI](https://github.com/maize-genetics/phg_v2/actions/workflows/phgv2_ci.yml/badge.svg)](https://github.com/maize-genetics/phg_v2/actions/workflows/phgv2_ci.yml) [![codecov](https://codecov.io/gh/maize-genetics/phg_v2/graph/badge.svg?token=4BVD2QXQ1A)](https://codecov.io/gh/maize-genetics/phg_v2) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 
 The Practical Haplotype Graph (PHG) is a powerful tool for representing pangenomes.  The PHG is optimized for the plant breeding and genetics, where genomic diversity can be high, phased haplotypes are common (e.g. inbred lines), and imputation with low density markers is essential for breeding efficiency. This is powerful complement to the excellent tools such as [BEAGLE](https://faculty.washington.edu/browning/beagle/beagle.html) that is used extensively in low diversity, unphased species with high density genotyping.
@@ -7,7 +10,31 @@ The PHG is a trellis graph based representation of genic and intergenic regions
 
 [PHGv1](https://bitbucket.org/bucklerlab/practicalhaplotypegraph/wiki/Home) was [published in 2022](https://doi.org/10.1093/bioinformatics/btac410). It addressed many challenges related to aligning diverse genomes, efficient storage, and imputation across a pangenome. However, it depended on a custom relational database that necessitated unique formats, and database queries did not scale effectively with a large number of taxa and rare alleles. Moreover, after developing PHGs for six species, we identified significant opportunities to refine and streamline the platform for curation.
 
-# PHGv2 design
+## Quick Start
+> [!IMPORTANT]
+> PHG2 is still under active development; please check back regularly for updates!
+
+You can download PHG2 from the [releases page](https://github.com/maize-genetics/phg_v2/releases), but we recommend installing through Conda:
+
+```
+conda install --channel conda-forge --channel maize-genetics phg2
+```
+
+We recommend creating a dedicated Conda environment for each of your bioinformatics projects, as opposed to installing directly into the base environment; please refer to Conda's documentation [on managing environments](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-environments). To create an environment and install PHG2 in a single command, do as follows:
+
+```
+conda create --name my_env --channel conda-forge --channel maize-genetics phg2
+```
+
+If you've already installed PHG2, you can update to the newest version via:
+
+```
+conda update phg2
+```
+
+Please refer to the [building and loading documentation](docs/build_and_load.md) for instructions on getting started with using PHG2.
+
+## Design
 The redesign leverages the powerful TileDB-VCF database, which is widely used in human genetics for extensive medical applications and is highly performant for rapid querying and storage of rare variants. The PHG is now backed by two TileDB-VCF databases: one for tracking haplotypes across all samples (h.vcf), and another for tracking variants relative to either the reference genomes or the closest haplotype (g.vcf). Our implementation of haplotype encoding in VCF heavily relies on the VCF ALT haplotype specification [v4.3](http://samtools.github.io/hts-specs/VCFv4.3.pdf).
 
 * High-quality phased genome assemblies (or similar) are available to initialize the PHG.
@@ -17,7 +44,7 @@ The redesign leverages the powerful TileDB-VCF database, which is widely used in
 * Genotyping with low-density markers is now done using a memory- and speed-efficient kmer approach, followed by pathfinding (imputation) with HMM, BWT, or our ML model.
 * Rare allele discovery with short reads is based on the above path, involving short read alignment to the inferred haplotype path genome and the GATK haplotype caller.
 
-# PHG terminology
+## Terminology
 
     Reference genome - the genome used for initial alignment and base coordinates
     Reference range - a segment of the reference genome
@@ -27,56 +54,3 @@ The redesign leverages the powerful TileDB-VCF database, which is widely used in
     Composite Reference Haplotypes 
 
 More information on terminology can be found [here](docs/terminology.md).
-
-# Example usage
-To populate that database
-```
-## Setup conda environment
-./phg setup-environment
-
-## Initialize DBs
-./phg initdb --db-path /path/to/dbs
-
-## Preprocessing data
-./phg annotate-fastas --keyfile /path/to/keyfile --output-dir /path/to/annotated/fastas --threads numberThreadstoRun
-
-## Build VCF data
-./phg create-ranges --reference-file Ref.fa --gff my.gff --boundary gene --pad 500 -o /path/to/bed/file.bed
-./phg align-assemblies --gff anchors.gff --reference-file Ref.fa -a assembliesList.txt --total-threads 20 --in-parallel 4 -o /path/for/generatedFiles
-./phg agc-compress --db-path /path/to/dbs --reference-file /my/ref.fasta --fasta-list /my/assemblyFastaList.txt 
-./phg create-ref-vcf --bed /my/bed/file.bed --reference-file /my/ref.fasta --reference-url https://url-for-ref --reference-name B73 --output-dir /path/to/vcfs
-./phg create-maf-vcf --db-path /path/to/dbs --bed /my/bed/file.bed --reference-file /my/ref.fasta --maf-dir /my/maf/files -o /path/to/vcfs
-
-## Load data into DBs
-./phg load-vcf --vcf /my/vcf/dir --dbpath /path/to/dbs
-```
-
-```
-## Index
-./phg index-kmers --ancestor founder.h.vcf -o kmer_index.map // we need this
-
-## Map
-./phg map-kmers \
-    --kmer-index kmer_index.map \
-    --reads my_reads.fastq \ // possibly thousands of samples being inputted
-    --output read_count_out.map \ // could we pipe this into impute method? // thousands of outputs
-    // consider batch interface here ^^
-
-## Impute
-./phg impute \
-    --hap-counts read_count_out.map \ // will users understand the di
-    --diploid false \
-    --ancestor founder.h.vcf \
-    --max-anc-hap-num 20 \
-    --max-anc-hap-prop 0.95 \
-    --output-parent best_parents.txt \
-    -o my_impute.h.vcf
-
-## Load
-./phg load-vcf --vcf my_impute.vcf --dbpath /my/db/uri
-```
-
-```
-## Export from Tiledb
-./phg export-vcf --db-path /my/db/uri --dataset-type hvcf --sample-Names LineA,LineB --output-dir /my/output/dir
-
diff --git a/conda/build.sh b/conda/build.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+set -euo pipefail  # abort on any failed command or unset variable
+
+PHG2_DIR="$PREFIX/share/phg2-$PKG_VERSION-$PKG_BUILDNUM"
+
+mkdir -p "$PREFIX/bin" "$PHG2_DIR"
+
+mv "$SRC_DIR"/* "$PHG2_DIR"/
+
+# Symbolic link that "points" to the phg script; a hard link
+# leads to being unable to find the jars in lib/
+# Helpful: https://stackoverflow.com/a/29786294
+ln -s "$PHG2_DIR/bin/phg" "$PREFIX/bin/phg2"
diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -0,0 +1,31 @@
+{% set name = "PHG2" %}
+
+package:
+  name: {{ name|lower }}
+  version: "{{ environ.get('PHG2_VERSION') }}"  # PHG2_* values are read from the build environment
+
+source:
+  url: https://github.com/maize-genetics/phg_v2/releases/download/{{ environ.get('PHG2_VERSION') }}/PHGv2-v{{ environ.get('PHG2_RELEASE') }}.tar
+  md5: "{{ environ.get('PHG2_VERSION_MD5') }}"
+
+build:
+  noarch: generic
+
+# NOTE: Dependencies like AnchorWave are not included here
+#       since they are used within the phgv2-conda environment.
+#       In future if it's desired to move away from that design
+#       decision those dependencies can be added and the reliance
+#       on a bespoke environment for execution can be removed.
+requirements:
+  run:
+    - openjdk >=17.0
+
+test:
+  commands:
+    - 'phg2 | grep "Usage: phg"'
+
+about:
+  home: https://github.com/maize-genetics/phg_v2
+  license: Apache-2.0
+  summary: 'The Practical Haplotype Graph (PHG) is a powerful tool for representing pangenomes.'
+  dev_url: https://github.com/maize-genetics/phg_v2