Merge pull request #6 from cvaske/notebook-hic

droumis · web-flow · commit f0ad07e33bfc · 2025-02-03T17:57:44.000+01:00
Notebook hic conversion example
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,6 +16,7 @@ repos:
   - id: mixed-line-ending
   - id: no-commit-to-branch
   - id: pretty-format-json
+    exclude_types: [jupyter]
   - id: trailing-whitespace
 
 - repo: https://github.com/astral-sh/ruff-pre-commit
diff --git a/notebooks/load-hic-from-GEO.ipynb b/notebooks/load-hic-from-GEO.ipynb
@@ -0,0 +1,150 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Conversion from `.hic` to holoSeq `hseq` format\n",
+    "\n",
+    "This Jupyter notebook shows an example of downloading a `.hic` file directly from GEO (Gene Expression Omnibus), converting it to the `hseq` format, and then launching a Panel server from the command line to view it.\n",
+    "\n",
+    "This notebook assumes it was checked out via git together with the rest of the repository, because it imports the conversion code from `../scripts`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run this to install all required dependencies if they are not already available in the kernel's environment.\n",
+    "# If you made a kernel out of the package's venv, you can skip this step. %pip (unlike !pip) installs into the running kernel's environment.\n",
+    "%pip install datashader 'dask[dataframe]' 'holoviews[recommended]' pandas matplotlib bokeh hic-straw"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Configuration\n",
+    "The following settings can be changed to point to other samples or output file names."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Name of the sample to put as metadata in the hseq file\n",
+    "HIC_TITLE = \"A001C007\"\n",
+    "## URL to the hic file\n",
+    "HIC_URL = \"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM6326nnn/GSM6326543/suppl/GSM6326543%5FA001C007%2Ehg38%2Enodups%2Epairs%2Ehic\"\n",
+    "## Local download path (an f-string, so the sample name is substituted into the file name)\n",
+    "HIC_FILE = f\"{HIC_TITLE}_hic.txt.gz\"\n",
+    "\n",
+    "## Output file names; the chromosome-length file is the hseq file name with a .len suffix\n",
+    "hseq_filename = f\"{HIC_TITLE}_hseq.txt.gz\"\n",
+    "lenfile_name = f\"{hseq_filename}.len\"\n",
+    "\n",
+    "## Number of chromosomes to include in the hseq file.\n",
+    "## In the example file, the first two chromosomes are the \"ALL\" catchall and the mitochondrial chromosome.\n",
+    "## Set this to 0 to convert all chromosomes.\n",
+    "MAX_CHROM = 5"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Make the holoSeq conversion script importable (the relative path assumes the kernel runs from the notebooks/ directory), then load it\n",
+    "import sys\n",
+    "sys.path.extend([\"../scripts\"])\n",
+    "import hic2hseq"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download from GEO\n",
+    "The given example file is about a 5GB download."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import urllib.request\n",
+    "urllib.request.urlretrieve(HIC_URL, HIC_FILE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Convert to hseq\n",
+    "Conversion of the entire 5GB .hic file takes 10-20 minutes, and the output is about 500MB. Converting fewer chromosomes (see `MAX_CHROM`) is faster and produces a smaller output file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(lenfile_name, mode=\"w\") as lenfile_stream:  # context manager closes and flushes the .len file even if conversion fails\n",
+    "    with hic2hseq.GzipOut(hseq_filename) as ostream:\n",
+    "        hic2hseq.convert_hic_to_hseq(HIC_FILE, ostream, lenfile_stream, lenfile_name, MAX_CHROM, HIC_TITLE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Start the panel server"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!panel serve ../scripts/holoseq_display.py --show --args --inFile {hseq_filename} --size 1000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "holoSeq",
+   "language": "python",
+   "name": "holoseq"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/scripts/hic2hseq.py b/scripts/hic2hseq.py
@@ -43,7 +43,7 @@
 import logging
 import os
 import argparse
-from typing import List
+from typing import List, TextIO
 
 import hicstraw
 
@@ -79,7 +79,7 @@ def cumulative_sum(x: List[int]) -> List[int]:
     return y
 
 
-def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str) -> None:
+def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, lenfile_stream: TextIO,lenfile_name: str, max_chrom: int, title: str) -> None:
     """Convert a .hic file into HoloSeq hseq format.
 
     Parameters:
@@ -88,8 +88,6 @@ def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str
         max_chrom (int): Maximum number of chromosomes to include.
         title (str): Title for the output matrix.
     """
-    if not os.path.isfile(hicfn):
-        raise FileNotFoundError(f"Input file '{hicfn}' does not exist.")
 
     hic = hicstraw.HiCFile(hicfn)
     chroms = hic.getChromosomes()
@@ -102,7 +100,7 @@ def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str
     cnames = [c.name for c in chroms]
 
     ostream.write(
-        f"@v1HoloSeq2D\n@title {title}\n"
+        f"@v1HoloSeq2D\n@@heatmap\n@@title {title}\n@@xclenfile {lenfile_name}\n@@yclenfile {lenfile_name}\n@@axes H1\n"
         + "".join(f"@H1 {chrom} {offset}\n" for chrom, offset in zip(cnames, offsets))
     )
 
@@ -141,10 +139,13 @@ def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str
     argv = ap.parse_args()
 
     argv.title = argv.title or argv.hicfile
+    lenfile_path = argv.hseqgz + ".len"
+    lenfile_name = os.path.basename(lenfile_path)
+    lenfile_stream = open(lenfile_path, mode="w")
 
     try:
         with GzipOut(argv.hseqgz) as ostream:
-            convert_hic_to_hseq(argv.hicfile, ostream, argv.max_chrom, argv.title)
+            convert_hic_to_hseq(argv.hicfile, ostream, lenfile_stream, lenfile_name, argv.max_chrom, argv.title)
         logger.info("Conversion completed successfully.")
     except Exception as e:
         logger.error(f"Failed to convert .hic to HoloSeq format: {e}")