Skip to content

Commit f0ad07e

Browse files
authored
Merge pull request #6 from cvaske/notebook-hic
Notebook hic conversion example
2 parents 72e75d1 + b01aee1 commit f0ad07e

File tree

3 files changed

+158
-6
lines changed

3 files changed

+158
-6
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ repos:
1616
- id: mixed-line-ending
1717
- id: no-commit-to-branch
1818
- id: pretty-format-json
19+
exclude_types: [jupyter]
1920
- id: trailing-whitespace
2021

2122
- repo: https://github.com/astral-sh/ruff-pre-commit

notebooks/load-hic-from-GEO.ipynb

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Conversion from `.hic` to holoSEq `hseq` format\n",
8+
"\n",
9+
"This Jupyter notebook shows an example of converting a .hic file directly from GEO (Gene Expression Omnibus), and then launching a panel server by the command line to view it.\n",
10+
"\n",
11+
"This notebook has been checked out via git, to get the entire repository of code."
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": null,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"# run this to install all required dependencies before running the main code if not already in the venv\n",
21+
"# If you made a kernel out of the package's venv, you can skip this step\n",
22+
"! pip install datashader 'dask[dataframe]' 'holoviews[recommended]' pandas matplotlib bokeh hic-straw"
23+
]
24+
},
25+
{
26+
"cell_type": "markdown",
27+
"metadata": {},
28+
"source": [
29+
"# Configuration\n",
30+
"The following paths can be changed to point to other samples or output names"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 1,
36+
"metadata": {},
37+
"outputs": [],
38+
"source": [
39+
"## Name of the sample to put as metadata in the hseq file\n",
40+
"HIC_TITLE = \"A001C007\"\n",
41+
"## URL to the hic file \n",
42+
"HIC_URL = \"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM6326nnn/GSM6326543/suppl/GSM6326543%5FA001C007%2Ehg38%2Enodups%2Epairs%2Ehic\"\n",
43+
"## Local download path\n",
44+
"HIC_FILE = \"{HIC_TITLE}_hic.txt.gz\"\n",
45+
"\n",
46+
"## Output file names\n",
47+
"hseq_filename = f\"{HIC_TITLE}_hseq.txt.gz\"\n",
48+
"lenfile_name = f\"{HIC_TITLE}_hseq.txt.gz.len\"\n",
49+
"\n",
50+
"## Number of chromosomes to include in the hseq file.\n",
51+
"## In the example file, the first two chromosomes are the \"ALL\" catchall and the mitochondrial chromosome.\n",
52+
"## Set this to 0 to convert all chromosomes.\n",
53+
"MAX_CHROM = 5"
54+
]
55+
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": 2,
59+
"metadata": {},
60+
"outputs": [],
61+
"source": [
62+
"## Load the conversion code from the holoSeq repository\n",
63+
"import sys\n",
64+
"sys.path.append(\"../scripts\")\n",
65+
"import hic2hseq"
66+
]
67+
},
68+
{
69+
"cell_type": "markdown",
70+
"metadata": {},
71+
"source": [
72+
"# Download from GEO\n",
73+
"The given example file is about a 5GB download."
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": null,
79+
"metadata": {},
80+
"outputs": [],
81+
"source": [
82+
"from urllib.request import urlretrieve\n",
83+
"urlretrieve(HIC_URL, HIC_FILE)"
84+
]
85+
},
86+
{
87+
"cell_type": "markdown",
88+
"metadata": {},
89+
"source": [
90+
"# Convert to hseq\n",
91+
"Conversion of the entire 5GB .hic file takes 10-20 minutes, and the output is about 500MB. Fewer chromosomes will convert faster and have a smaller output file."
92+
]
93+
},
94+
{
95+
"cell_type": "code",
96+
"execution_count": null,
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
100+
"lenfile_stream = open(lenfile_name, mode=\"w\")\n",
101+
"with hic2hseq.GzipOut(hseq_filename) as ostream:\n",
102+
" hic2hseq.convert_hic_to_hseq(HIC_FILE, ostream, lenfile_stream, lenfile_name, MAX_CHROM, HIC_TITLE)"
103+
]
104+
},
105+
{
106+
"cell_type": "markdown",
107+
"metadata": {},
108+
"source": [
109+
"# Start the panel server"
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"execution_count": null,
115+
"metadata": {},
116+
"outputs": [],
117+
"source": [
118+
"!panel serve ../scripts/holoseq_display.py --show --args --inFile {HIC_TITLE}_hseq.txt.gz --size 1000"
119+
]
120+
},
121+
{
122+
"cell_type": "code",
123+
"execution_count": null,
124+
"metadata": {},
125+
"outputs": [],
126+
"source": []
127+
}
128+
],
129+
"metadata": {
130+
"kernelspec": {
131+
"display_name": "holoSeq",
132+
"language": "python",
133+
"name": "holoseq"
134+
},
135+
"language_info": {
136+
"codemirror_mode": {
137+
"name": "ipython",
138+
"version": 3
139+
},
140+
"file_extension": ".py",
141+
"mimetype": "text/x-python",
142+
"name": "python",
143+
"nbconvert_exporter": "python",
144+
"pygments_lexer": "ipython3",
145+
"version": "3.12.1"
146+
}
147+
},
148+
"nbformat": 4,
149+
"nbformat_minor": 2
150+
}

scripts/hic2hseq.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
import logging
4444
import os
4545
import argparse
46-
from typing import List
46+
from typing import List, TextIO
4747

4848
import hicstraw
4949

@@ -79,7 +79,7 @@ def cumulative_sum(x: List[int]) -> List[int]:
7979
return y
8080

8181

82-
def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str) -> None:
82+
def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, lenfile_stream: TextIO,lenfile_name: str, max_chrom: int, title: str) -> None:
8383
"""Convert a .hic file into HoloSeq hseq format.
8484
8585
Parameters:
@@ -88,8 +88,6 @@ def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str
8888
max_chrom (int): Maximum number of chromosomes to include.
8989
title (str): Title for the output matrix.
9090
"""
91-
if not os.path.isfile(hicfn):
92-
raise FileNotFoundError(f"Input file '{hicfn}' does not exist.")
9391

9492
hic = hicstraw.HiCFile(hicfn)
9593
chroms = hic.getChromosomes()
@@ -102,7 +100,7 @@ def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str
102100
cnames = [c.name for c in chroms]
103101

104102
ostream.write(
105-
f"@v1HoloSeq2D\n@title {title}\n"
103+
f"@v1HoloSeq2D\n@@heatmap\n@@title {title}\n@@xclenfile {lenfile_name}\n@@yclenfile {lenfile_name}\n@@axes H1\n"
106104
+ "".join(f"@H1 {chrom} {offset}\n" for chrom, offset in zip(cnames, offsets))
107105
)
108106

@@ -141,10 +139,13 @@ def convert_hic_to_hseq(hicfn: str, ostream: GzipOut, max_chrom: int, title: str
141139
argv = ap.parse_args()
142140

143141
argv.title = argv.title or argv.hicfile
142+
lenfile_path = argv.hseqgz + ".len"
143+
lenfile_name = os.path.basename(lenfile_path)
144+
lenfile_stream = open(lenfile_path, mode="w")
144145

145146
try:
146147
with GzipOut(argv.hseqgz) as ostream:
147-
convert_hic_to_hseq(argv.hicfile, ostream, argv.max_chrom, argv.title)
148+
convert_hic_to_hseq(argv.hicfile, ostream, lenfile_stream, lenfile_name, argv.max_chrom, argv.title)
148149
logger.info("Conversion completed successfully.")
149150
except Exception as e:
150151
logger.error(f"Failed to convert .hic to HoloSeq format: {e}")

0 commit comments

Comments
 (0)