|
59 | 59 | "Requirement already satisfied: wheel in /usr/local/lib/python3.8/dist-packages (0.38.4)\n", |
60 | 60 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", |
61 | 61 | "\u001b[0mLooking in indexes: https://test.pypi.org/simple/, https://us-python.pkg.dev/colab-wheels/public/simple/, https://pypi.org/simple\n", |
62 | | - "Collecting deepconsensus[cpu]==1.1.0\n", |
63 | | - " Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.1.0-py3-none-any.whl (157 kB)\n", |
| 62 | + "Collecting deepconsensus[cpu]==1.2.0\n", |
| 63 | + " Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.2.0-py3-none-any.whl (157 kB)\n", |
64 | 64 | "Collecting tf-models-official==2.9.2\n", |
65 | 65 | " Using cached tf_models_official-2.9.2-py2.py3-none-any.whl (2.1 MB)\n", |
66 | 66 | "Requirement already satisfied: absl-py==1.0.0 in /usr/local/lib/python3.8/dist-packages (from deepconsensus[cpu]==1.1.0) (1.0.0)\n", |
|
174 | 174 | " Found existing installation: pandas 1.3.5\n", |
175 | 175 | " Uninstalling pandas-1.3.5:\n", |
176 | 176 | " Successfully uninstalled pandas-1.3.5\n", |
177 | | - "Successfully installed deepconsensus-1.1.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n", |
| 177 | + "Successfully installed deepconsensus-1.2.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n", |
178 | 178 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", |
179 | 179 | "\u001b[0m" |
180 | 180 | ] |
181 | 181 | } |
182 | 182 | ], |
183 | 183 | "source": [ |
184 | 184 | "! pip install --upgrade pip setuptools wheel\n", |
185 | | - "! pip install deepconsensus[cpu]==1.1.0" |
| 185 | + "! pip install deepconsensus[cpu]==1.2.0" |
186 | 186 | ] |
187 | 187 | }, |
188 | 188 | { |
|
197 | 197 | "import colorama\n", |
198 | 198 | "import numpy as np\n", |
199 | 199 | "import random\n", |
200 | | - "import PIL\n", |
201 | | - "from IPython.display import display\n", |
| 200 | + "import matplotlib.pyplot as plt\n", |
202 | 201 | "from deepconsensus.models import model_configs\n", |
203 | 202 | "from deepconsensus.models import model_utils\n", |
204 | 203 | "from deepconsensus.models import data_providers\n", |
|
222 | 221 | }, |
223 | 222 | "outputs": [], |
224 | 223 | "source": [ |
225 | | - "def plot_array(arr, scale = 5):\n", |
226 | | - " \"\"\"Transforms array for plotting.\"\"\"\n", |
227 | | - " arr = np.copy(arr).astype('float64')\n", |
228 | | - " arr = np.where(arr \u003e 1.0e-10, arr, -10.0)\n", |
229 | | - " arr = np.log(arr, where=arr \u003e 0.0, out=arr) * 100.0\n", |
230 | | - " arr = np.where(arr \u003e 0, arr + 25.0, 0)\n", |
231 | | - " arr = arr.astype('uint8')\n", |
232 | | - " im = PIL.Image.fromarray(arr, 'P')\n", |
233 | | - " im = im.resize((im.size[0] * scale, im.size[1] * scale))\n", |
234 | | - " display(im)\n", |
| 224 | + "def plot_array(arr):\n", |
| 225 | + " \"\"\"Plot the array.\"\"\"\n", |
| 226 | + " plt.figure(figsize = (20,10))\n", |
| 227 | + " plt.imshow(arr, cmap=\"Greys_r\", vmin=0, vmax=20)\n", |
| 228 | + " plt.axis('off')\n", |
| 229 | + " plt.show()\n", |
235 | 230 | "\n", |
236 | | - "def show_rows_for_one_example(rows, max_passes=20):\n", |
| 231 | + "def show_rows_for_one_example(rows, max_passes=20, use_ccs_bq=True):\n", |
237 | 232 | " \"\"\"Break out the black/white matrix into subreads, ccs, PW, IP, etc.\n", |
238 | 233 | "\n", |
239 | | - " For each of `max_subreads`, we have four pieces of information: bases, PW, IP,\n", |
240 | | - " and strand. We also have one row for CCS, and four rows for SN (in that\n", |
241 | | - " order).\n", |
242 | | - " The information is structured as follows:\n", |
243 | | - " Bases: rows 0 to (params.max_passes - 1)\n", |
244 | | - " PW: rows (params.max_passes) to (params.max_passes * 2 - 1)\n", |
245 | | - " IP: rows (params.max_passes * 2) to (params.max_passes * 3 - 1)\n", |
246 | | - " Strand: rows (params.max_passes * 3) to (params.max_passes * 4 - 1)\n", |
247 | | - " CCS+SN: rows (params.max_passes * 4) to (params.max_passes * 4 + 5)\n", |
| 234 | + " For each of `max_subreads`, we have multiple pieces of information: bases, PW,\n", |
| 235 | + " IP, and strand. We also have one row for CCS, another row for CCS base\n", |
| 236 | + "  qualities (optionally) and four rows for SN.\n", |
248 | 237 | " \"\"\"\n", |
| 238 | + " (\n", |
| 239 | + " base_indices,\n", |
| 240 | + " pw_indices,\n", |
| 241 | + " ip_indices,\n", |
| 242 | + " strand_indices,\n", |
| 243 | + " ccs_indices,\n", |
| 244 | + " ccs_bq_indices,\n", |
| 245 | + " sn_indices,\n", |
| 246 | + " ) = data_providers.get_indices(max_passes, use_ccs_bq)\n", |
| 247 | + " \n", |
| 248 | + " print(base_indices)\n", |
249 | 249 | " print('bases:')\n", |
250 | | - " plot_array(rows[0:max_passes])\n", |
| 250 | + " plot_array(rows[slice(*base_indices)])\n", |
251 | 251 | " print('PW: pulse-width')\n", |
252 | | - " pw = rows[max_passes:max_passes*2]\n", |
| 252 | + " pw = rows[slice(*pw_indices)]\n", |
253 | 253 | " plot_array(pw)\n", |
254 | 254 | " print('IP: inter-pulse duration')\n", |
255 | | - " ip = rows[max_passes*2:max_passes*3]\n", |
| 255 | + " ip = rows[slice(*ip_indices)]\n", |
256 | 256 | " plot_array(ip)\n", |
257 | 257 | "\n", |
258 | 258 | " print('Strand:')\n", |
259 | | - " plot_array(rows[max_passes*3:-5])\n", |
| 259 | + " plot_array(rows[slice(*strand_indices)])\n", |
260 | 260 | "\n", |
261 | 261 | " print('CCS:')\n", |
262 | | - " ccs = rows[-5:-4]\n", |
| 262 | + " ccs = rows[slice(*ccs_indices)]\n", |
263 | 263 | " plot_array(ccs)\n", |
| 264 | + " if use_ccs_bq:\n", |
| 265 | + " print('CCS base quality scores:')\n", |
| 266 | + " ccs_bq = rows[slice(*ccs_bq_indices)]\n", |
| 267 | + " plot_array(ccs_bq)\n", |
264 | 268 | " print('SN:')\n", |
265 | | - " sn = rows[-4:]\n", |
| 269 | + " sn = rows[slice(*sn_indices)]\n", |
266 | 270 | " plot_array(sn)" |
267 | 271 | ] |
268 | 272 | }, |
|
279 | 283 | " return ''.join([dc_constants.SEQ_VOCAB[int(base)] for base in encoded_sequence])\n", |
280 | 284 | "\n", |
281 | 285 | "\n", |
282 | | - "def get_ccs_matrix(rows_for_one_example):\n", |
| 286 | + "def get_ccs_matrix(rows_for_one_example, max_passes, use_ccs_bq):\n", |
283 | 287 | " \"\"\"Slice the encoded CCS out of the 2d rows matrix of one example.\"\"\"\n", |
284 | | - " return rows_for_one_example[-5, :]\n", |
| 288 | + " _, _, _, _, ccs_indices, _, _ = data_providers.get_indices(\n", |
| 289 | + " max_passes=max_passes,\n", |
| 290 | + " use_ccs_bq=use_ccs_bq,\n", |
| 291 | + " )\n", |
| 292 | + " return rows_for_one_example[slice(*ccs_indices), :][0]\n", |
285 | 293 | "\n", |
286 | 294 | "\n", |
287 | | - "def get_ccs_seq(rows_for_one_example):\n", |
| 295 | + "def get_ccs_seq(rows_for_one_example, max_passes, use_ccs_bq):\n", |
288 | 296 | " \"\"\"Get CCS string out of the 2d rows matrix of one example.\"\"\"\n", |
289 | | - " return encoded_sequence_to_string(get_ccs_matrix(rows_for_one_example))\n", |
| 297 | + " return encoded_sequence_to_string(get_ccs_matrix(\n", |
| 298 | + " rows_for_one_example, max_passes, use_ccs_bq))\n", |
290 | 299 | "\n", |
291 | 300 | "\n", |
292 | 301 | "def colorful(seq):\n", |
|
305 | 314 | " return ''.join(colored_seq)\n", |
306 | 315 | "\n", |
307 | 316 | "\n", |
308 | | - "def show_example(batch, example_i, ypreds=None, max_passes=20):\n", |
| 317 | + "def show_example(batch, example_i, ypreds=None, max_passes=20, use_ccs_bq=True):\n", |
309 | 318 | " \"\"\"Show an example with subreads, ccs, predictions, and labels.\"\"\"\n", |
310 | 319 | " batch_size = batch['rows'].shape[0]\n", |
311 | 320 | " rows = batch['rows'][example_i, :, :, 0]\n", |
312 | | - " ccs_i = get_ccs_seq(rows)\n", |
| 321 | + " ccs_i = get_ccs_seq(rows, max_passes, use_ccs_bq)\n", |
313 | 322 | " print(f'Example: {example_i} of {batch_size} (batch)')\n", |
314 | 323 | " print('How the sequences are represented for the model:')\n", |
315 | 324 | " subreads = rows[0:max_passes, :]\n", |
|
360 | 369 | "name": "stdout", |
361 | 370 | "output_type": "stream", |
362 | 371 | "text": [ |
363 | | - "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.data-00000-of-00001...\n", |
364 | | - "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.index...\n", |
365 | | - "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/params.json...\n", |
| 372 | + "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.data-00000-of-00001...\n", |
| 373 | + "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.index...\n", |
| 374 | + "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/params.json...\n", |
366 | 375 | "- [3 files][ 85.7 MiB/ 85.7 MiB] \n", |
367 | 376 | "Operation completed over 3 objects/85.7 MiB. \n", |
368 | | - "Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz...\n", |
| 377 | + "Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz...\n", |
369 | 378 | "/ [1 files][ 6.9 MiB/ 6.9 MiB] \n", |
370 | 379 | "Operation completed over 1 objects/6.9 MiB. \n" |
371 | 380 | ] |
|
374 | 383 | "source": [ |
375 | 384 | "# Download Model\n", |
376 | 385 | "! mkdir -p deepconsensus_model\n", |
377 | | - "! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/* deepconsensus_model/\n", |
| 386 | + "! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/* deepconsensus_model/\n", |
378 | 387 | "# Download test data\n", |
379 | | - "! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz" |
| 388 | + "! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz" |
380 | 389 | ] |
381 | 390 | }, |
382 | 391 | { |
|
451 | 460 | "model = model_utils.get_model(params)\n", |
452 | 461 | "checkpoint = tf.train.Checkpoint(model=model)\n", |
453 | 462 | "\n", |
454 | | - "row_size = data_providers.get_total_rows(params.max_passes)\n", |
| 463 | + "row_size = data_providers.get_total_rows(params.max_passes, params.use_ccs_bq)\n", |
455 | 464 | "input_shape = (1, row_size, params.max_length, params.num_channels)\n", |
456 | 465 | "model_utils.print_model_summary(model, input_shape)\n", |
457 | 466 | "checkpoint.restore(\n", |
|
500 | 509 | "text": [ |
501 | 510 | "name.shape: (20, 1)\n", |
502 | 511 | "label.shape: (20, 100)\n", |
503 | | - "rows.shape: (20, 85, 100, 1)\n", |
| 512 | + "rows.shape: (20, 86, 100, 1)\n", |
504 | 513 | "num_passes.shape: (20, 1)\n", |
505 | 514 | "window_pos.shape: (20, 1)\n" |
506 | 515 | ] |
|
604 | 613 | "source": [ |
605 | 614 | "### Let's break that down\n", |
606 | 615 | "\n", |
607 | | - "Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.1 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)." |
| 616 | + "Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.2 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)." |
608 | 617 | ] |
609 | 618 | }, |
610 | 619 | { |
|
978 | 987 | ], |
979 | 988 | "metadata": { |
980 | 989 | "colab": { |
| 990 | + "last_runtime": { |
| 991 | + "build_target": "//learning/genomics/internal:genomics_colab", |
| 992 | + "kind": "private" |
| 993 | + }, |
981 | 994 | "provenance": [ |
982 | 995 | { |
983 | 996 | "file_id": "/piper/depot/google3/learning/genomics/deepconsensus/opensource_only/g3doc/notebooks/Copy_of_Inspecting_DeepConsensus_examples_and_running_model.ipynb?workspaceId=belyaeva:dc_colab::citc", |
|
0 commit comments