Skip to content

Commit 7dd3bb0

Browse files
anastasiyabl and copybara-github
authored and committed
Update inspecting examples colab.
PiperOrigin-RevId: 512956150
1 parent 21a5346 commit 7dd3bb0

1 file changed

Lines changed: 60 additions & 47 deletions

File tree

notebooks/Inspecting_DeepConsensus_examples_and_running_model.ipynb

Lines changed: 60 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@
5959
"Requirement already satisfied: wheel in /usr/local/lib/python3.8/dist-packages (0.38.4)\n",
6060
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
6161
"\u001b[0mLooking in indexes: https://test.pypi.org/simple/, https://us-python.pkg.dev/colab-wheels/public/simple/, https://pypi.org/simple\n",
62-
"Collecting deepconsensus[cpu]==1.1.0\n",
63-
" Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.1.0-py3-none-any.whl (157 kB)\n",
62+
"Collecting deepconsensus[cpu]==1.2.0\n",
63+
" Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.2.0-py3-none-any.whl (157 kB)\n",
6464
"Collecting tf-models-official==2.9.2\n",
6565
" Using cached tf_models_official-2.9.2-py2.py3-none-any.whl (2.1 MB)\n",
6666
"Requirement already satisfied: absl-py==1.0.0 in /usr/local/lib/python3.8/dist-packages (from deepconsensus[cpu]==1.1.0) (1.0.0)\n",
@@ -174,15 +174,15 @@
174174
" Found existing installation: pandas 1.3.5\n",
175175
" Uninstalling pandas-1.3.5:\n",
176176
" Successfully uninstalled pandas-1.3.5\n",
177-
"Successfully installed deepconsensus-1.1.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n",
177+
"Successfully installed deepconsensus-1.2.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n",
178178
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
179179
"\u001b[0m"
180180
]
181181
}
182182
],
183183
"source": [
184184
"! pip install --upgrade pip setuptools wheel\n",
185-
"! pip install deepconsensus[cpu]==1.1.0"
185+
"! pip install deepconsensus[cpu]==1.2.0"
186186
]
187187
},
188188
{
@@ -197,8 +197,7 @@
197197
"import colorama\n",
198198
"import numpy as np\n",
199199
"import random\n",
200-
"import PIL\n",
201-
"from IPython.display import display\n",
200+
"import matplotlib.pyplot as plt\n",
202201
"from deepconsensus.models import model_configs\n",
203202
"from deepconsensus.models import model_utils\n",
204203
"from deepconsensus.models import data_providers\n",
@@ -222,47 +221,52 @@
222221
},
223222
"outputs": [],
224223
"source": [
225-
"def plot_array(arr, scale = 5):\n",
226-
" \"\"\"Transforms array for plotting.\"\"\"\n",
227-
" arr = np.copy(arr).astype('float64')\n",
228-
" arr = np.where(arr \u003e 1.0e-10, arr, -10.0)\n",
229-
" arr = np.log(arr, where=arr \u003e 0.0, out=arr) * 100.0\n",
230-
" arr = np.where(arr \u003e 0, arr + 25.0, 0)\n",
231-
" arr = arr.astype('uint8')\n",
232-
" im = PIL.Image.fromarray(arr, 'P')\n",
233-
" im = im.resize((im.size[0] * scale, im.size[1] * scale))\n",
234-
" display(im)\n",
224+
"def plot_array(arr):\n",
225+
" \"\"\"Plot the array.\"\"\"\n",
226+
" plt.figure(figsize = (20,10))\n",
227+
" plt.imshow(arr, cmap=\"Greys_r\", vmin=0, vmax=20)\n",
228+
" plt.axis('off')\n",
229+
" plt.show()\n",
235230
"\n",
236-
"def show_rows_for_one_example(rows, max_passes=20):\n",
231+
"def show_rows_for_one_example(rows, max_passes=20, use_ccs_bq=True):\n",
237232
" \"\"\"Break out the black/white matrix into subreads, ccs, PW, IP, etc.\n",
238233
"\n",
239-
" For each of `max_subreads`, we have four pieces of information: bases, PW, IP,\n",
240-
" and strand. We also have one row for CCS, and four rows for SN (in that\n",
241-
" order).\n",
242-
" The information is structured as follows:\n",
243-
" Bases: rows 0 to (params.max_passes - 1)\n",
244-
" PW: rows (params.max_passes) to (params.max_passes * 2 - 1)\n",
245-
" IP: rows (params.max_passes * 2) to (params.max_passes * 3 - 1)\n",
246-
" Strand: rows (params.max_passes * 3) to (params.max_passes * 4 - 1)\n",
247-
" CCS+SN: rows (params.max_passes * 4) to (params.max_passes * 4 + 5)\n",
234+
" For each of `max_subreads`, we have multiple pieces of information: bases, PW,\n",
235+
" IP, and strand. We also have one row for CCS, another row for CCS base\n",
236+
" qualities (optionally) and four rows for SN.\n",
248237
" \"\"\"\n",
238+
" (\n",
239+
" base_indices,\n",
240+
" pw_indices,\n",
241+
" ip_indices,\n",
242+
" strand_indices,\n",
243+
" ccs_indices,\n",
244+
" ccs_bq_indices,\n",
245+
" sn_indices,\n",
246+
" ) = data_providers.get_indices(max_passes, use_ccs_bq)\n",
247+
" \n",
248+
" print(base_indices)\n",
249249
" print('bases:')\n",
250-
" plot_array(rows[0:max_passes])\n",
250+
" plot_array(rows[slice(*base_indices)])\n",
251251
" print('PW: pulse-width')\n",
252-
" pw = rows[max_passes:max_passes*2]\n",
252+
" pw = rows[slice(*pw_indices)]\n",
253253
" plot_array(pw)\n",
254254
" print('IP: inter-pulse duration')\n",
255-
" ip = rows[max_passes*2:max_passes*3]\n",
255+
" ip = rows[slice(*ip_indices)]\n",
256256
" plot_array(ip)\n",
257257
"\n",
258258
" print('Strand:')\n",
259-
" plot_array(rows[max_passes*3:-5])\n",
259+
" plot_array(rows[slice(*strand_indices)])\n",
260260
"\n",
261261
" print('CCS:')\n",
262-
" ccs = rows[-5:-4]\n",
262+
" ccs = rows[slice(*ccs_indices)]\n",
263263
" plot_array(ccs)\n",
264+
" if use_ccs_bq:\n",
265+
" print('CCS base quality scores:')\n",
266+
" ccs_bq = rows[slice(*ccs_bq_indices)]\n",
267+
" plot_array(ccs_bq)\n",
264268
" print('SN:')\n",
265-
" sn = rows[-4:]\n",
269+
" sn = rows[slice(*sn_indices)]\n",
266270
" plot_array(sn)"
267271
]
268272
},
@@ -279,14 +283,19 @@
279283
" return ''.join([dc_constants.SEQ_VOCAB[int(base)] for base in encoded_sequence])\n",
280284
"\n",
281285
"\n",
282-
"def get_ccs_matrix(rows_for_one_example):\n",
286+
"def get_ccs_matrix(rows_for_one_example, max_passes, use_ccs_bq):\n",
283287
" \"\"\"Slice the encoded CCS out of the 2d rows matrix of one example.\"\"\"\n",
284-
" return rows_for_one_example[-5, :]\n",
288+
" _, _, _, _, ccs_indices, _, _ = data_providers.get_indices(\n",
289+
" max_passes=max_passes,\n",
290+
" use_ccs_bq=use_ccs_bq,\n",
291+
" )\n",
292+
" return rows_for_one_example[slice(*ccs_indices), :][0]\n",
285293
"\n",
286294
"\n",
287-
"def get_ccs_seq(rows_for_one_example):\n",
295+
"def get_ccs_seq(rows_for_one_example, max_passes, use_ccs_bq):\n",
288296
" \"\"\"Get CCS string out of the 2d rows matrix of one example.\"\"\"\n",
289-
" return encoded_sequence_to_string(get_ccs_matrix(rows_for_one_example))\n",
297+
" return encoded_sequence_to_string(get_ccs_matrix(\n",
298+
" rows_for_one_example, max_passes, use_ccs_bq))\n",
290299
"\n",
291300
"\n",
292301
"def colorful(seq):\n",
@@ -305,11 +314,11 @@
305314
" return ''.join(colored_seq)\n",
306315
"\n",
307316
"\n",
308-
"def show_example(batch, example_i, ypreds=None, max_passes=20):\n",
317+
"def show_example(batch, example_i, ypreds=None, max_passes=20, use_ccs_bq=True):\n",
309318
" \"\"\"Show an example with subreads, ccs, predictions, and labels.\"\"\"\n",
310319
" batch_size = batch['rows'].shape[0]\n",
311320
" rows = batch['rows'][example_i, :, :, 0]\n",
312-
" ccs_i = get_ccs_seq(rows)\n",
321+
" ccs_i = get_ccs_seq(rows, max_passes, use_ccs_bq)\n",
313322
" print(f'Example: {example_i} of {batch_size} (batch)')\n",
314323
" print('How the sequences are represented for the model:')\n",
315324
" subreads = rows[0:max_passes, :]\n",
@@ -360,12 +369,12 @@
360369
"name": "stdout",
361370
"output_type": "stream",
362371
"text": [
363-
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.data-00000-of-00001...\n",
364-
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.index...\n",
365-
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/params.json...\n",
372+
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.data-00000-of-00001...\n",
373+
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.index...\n",
374+
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/params.json...\n",
366375
"- [3 files][ 85.7 MiB/ 85.7 MiB] \n",
367376
"Operation completed over 3 objects/85.7 MiB. \n",
368-
"Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz...\n",
377+
"Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz...\n",
369378
"/ [1 files][ 6.9 MiB/ 6.9 MiB] \n",
370379
"Operation completed over 1 objects/6.9 MiB. \n"
371380
]
@@ -374,9 +383,9 @@
374383
"source": [
375384
"# Download Model\n",
376385
"! mkdir -p deepconsensus_model\n",
377-
"! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/* deepconsensus_model/\n",
386+
"! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/* deepconsensus_model/\n",
378387
"# Download test data\n",
379-
"! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz"
388+
"! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz"
380389
]
381390
},
382391
{
@@ -451,7 +460,7 @@
451460
"model = model_utils.get_model(params)\n",
452461
"checkpoint = tf.train.Checkpoint(model=model)\n",
453462
"\n",
454-
"row_size = data_providers.get_total_rows(params.max_passes)\n",
463+
"row_size = data_providers.get_total_rows(params.max_passes, params.use_ccs_bq)\n",
455464
"input_shape = (1, row_size, params.max_length, params.num_channels)\n",
456465
"model_utils.print_model_summary(model, input_shape)\n",
457466
"checkpoint.restore(\n",
@@ -500,7 +509,7 @@
500509
"text": [
501510
"name.shape: (20, 1)\n",
502511
"label.shape: (20, 100)\n",
503-
"rows.shape: (20, 85, 100, 1)\n",
512+
"rows.shape: (20, 86, 100, 1)\n",
504513
"num_passes.shape: (20, 1)\n",
505514
"window_pos.shape: (20, 1)\n"
506515
]
@@ -604,7 +613,7 @@
604613
"source": [
605614
"### Let's break that down\n",
606615
"\n",
607-
"Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.1 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)."
616+
"Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.2 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)."
608617
]
609618
},
610619
{
@@ -978,6 +987,10 @@
978987
],
979988
"metadata": {
980989
"colab": {
990+
"last_runtime": {
991+
"build_target": "//learning/genomics/internal:genomics_colab",
992+
"kind": "private"
993+
},
981994
"provenance": [
982995
{
983996
"file_id": "/piper/depot/google3/learning/genomics/deepconsensus/opensource_only/g3doc/notebooks/Copy_of_Inspecting_DeepConsensus_examples_and_running_model.ipynb?workspaceId=belyaeva:dc_colab::citc",

0 commit comments

Comments
 (0)