|
59 | 59 | "Requirement already satisfied: wheel in /usr/local/lib/python3.8/dist-packages (0.38.4)\n", |
60 | 60 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", |
61 | 61 | "\u001b[0mLooking in indexes: https://test.pypi.org/simple/, https://us-python.pkg.dev/colab-wheels/public/simple/, https://pypi.org/simple\n", |
62 | | - "Collecting deepconsensus[cpu]==1.1.0\n", |
63 | | - " Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.1.0-py3-none-any.whl (157 kB)\n", |
| 62 | + "Collecting deepconsensus[cpu]==1.2.0\n", |
| 63 | + " Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.2.0-py3-none-any.whl (157 kB)\n", |
64 | 64 | "Collecting tf-models-official==2.9.2\n", |
65 | 65 | " Using cached tf_models_official-2.9.2-py2.py3-none-any.whl (2.1 MB)\n", |
66 | 66 | "Requirement already satisfied: absl-py==1.0.0 in /usr/local/lib/python3.8/dist-packages (from deepconsensus[cpu]==1.1.0) (1.0.0)\n", |
|
174 | 174 | " Found existing installation: pandas 1.3.5\n", |
175 | 175 | " Uninstalling pandas-1.3.5:\n", |
176 | 176 | " Successfully uninstalled pandas-1.3.5\n", |
177 | | - "Successfully installed deepconsensus-1.1.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n", |
| 177 | + "Successfully installed deepconsensus-1.2.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n", |
178 | 178 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", |
179 | 179 | "\u001b[0m" |
180 | 180 | ] |
181 | 181 | } |
182 | 182 | ], |
183 | 183 | "source": [ |
184 | 184 | "! pip install --upgrade pip setuptools wheel\n", |
185 | | - "! pip install deepconsensus[cpu]==1.1.0" |
| 185 | + "! pip install deepconsensus[cpu]==1.2.0" |
186 | 186 | ] |
187 | 187 | }, |
188 | 188 | { |
|
197 | 197 | "import colorama\n", |
198 | 198 | "import numpy as np\n", |
199 | 199 | "import random\n", |
200 | | - "import PIL\n", |
201 | | - "from IPython.display import display\n", |
| 200 | + "import matplotlib.pyplot as plt\n", |
202 | 201 | "from deepconsensus.models import model_configs\n", |
203 | 202 | "from deepconsensus.models import model_utils\n", |
204 | 203 | "from deepconsensus.models import data_providers\n", |
|
222 | 221 | }, |
223 | 222 | "outputs": [], |
224 | 223 | "source": [ |
225 | | - "def plot_array(arr, scale = 5):\n", |
226 | | - " \"\"\"Transforms array for plotting.\"\"\"\n", |
227 | | - " arr = np.copy(arr).astype('float64')\n", |
228 | | - " arr = np.where(arr \u003e 1.0e-10, arr, -10.0)\n", |
229 | | - " arr = np.log(arr, where=arr \u003e 0.0, out=arr) * 100.0\n", |
230 | | - " arr = np.where(arr \u003e 0, arr + 25.0, 0)\n", |
231 | | - " arr = arr.astype('uint8')\n", |
232 | | - " im = PIL.Image.fromarray(arr, 'P')\n", |
233 | | - " im = im.resize((im.size[0] * scale, im.size[1] * scale))\n", |
234 | | - " display(im)\n", |
| 224 | + "def plot_array(arr):\n", |
| 225 | + " \"\"\"Plot the array.\"\"\"\n", |
| 226 | + " plt.figure(figsize = (20,10))\n", |
| 227 | + " plt.imshow(arr, cmap=\"Greys_r\", vmin=0, vmax=20)\n", |
| 228 | + " plt.axis('off')\n", |
| 229 | + " plt.show()\n", |
235 | 230 | "\n", |
236 | | - "def show_rows_for_one_example(rows, max_passes=20):\n", |
| 231 | + "def show_rows_for_one_example(rows, max_passes=20, use_ccs_bq=True):\n", |
237 | 232 | " \"\"\"Break out the black/white matrix into subreads, ccs, PW, IP, etc.\n", |
238 | 233 | "\n", |
239 | | - " For each of `max_subreads`, we have four pieces of information: bases, PW, IP,\n", |
240 | | - " and strand. We also have one row for CCS, and four rows for SN (in that\n", |
241 | | - " order).\n", |
242 | | - " The information is structured as follows:\n", |
243 | | - " Bases: rows 0 to (params.max_passes - 1)\n", |
244 | | - " PW: rows (params.max_passes) to (params.max_passes * 2 - 1)\n", |
245 | | - " IP: rows (params.max_passes * 2) to (params.max_passes * 3 - 1)\n", |
246 | | - " Strand: rows (params.max_passes * 3) to (params.max_passes * 4 - 1)\n", |
247 | | - " CCS+SN: rows (params.max_passes * 4) to (params.max_passes * 4 + 5)\n", |
| 234 | + " For each of `max_subreads`, we have multiple pieces of information: bases, PW,\n", |
| 235 | + " IP, and strand. We also have one row for CCS, another row for CCS base\n", |
| 236 | + "  qualities (optionally) and four rows for SN.\n", |
248 | 237 | " \"\"\"\n", |
| 238 | + " (\n", |
| 239 | + " base_indices,\n", |
| 240 | + " pw_indices,\n", |
| 241 | + " ip_indices,\n", |
| 242 | + " strand_indices,\n", |
| 243 | + " ccs_indices,\n", |
| 244 | + " ccs_bq_indices,\n", |
| 245 | + " sn_indices,\n", |
| 246 | + " ) = data_providers.get_indices(max_passes, use_ccs_bq)\n", |
| 247 | + " \n", |
| 248 | + " print(base_indices)\n", |
249 | 249 | " print('bases:')\n", |
250 | | - " plot_array(rows[0:max_passes])\n", |
| 250 | + " plot_array(rows[slice(*base_indices)])\n", |
251 | 251 | " print('PW: pulse-width')\n", |
252 | | - " pw = rows[max_passes:max_passes*2]\n", |
| 252 | + " pw = rows[slice(*pw_indices)]\n", |
253 | 253 | " plot_array(pw)\n", |
254 | 254 | " print('IP: inter-pulse duration')\n", |
255 | | - " ip = rows[max_passes*2:max_passes*3]\n", |
| 255 | + " ip = rows[slice(*ip_indices)]\n", |
256 | 256 | " plot_array(ip)\n", |
257 | 257 | "\n", |
258 | 258 | " print('Strand:')\n", |
259 | | - " plot_array(rows[max_passes*3:-5])\n", |
| 259 | + " plot_array(rows[slice(*strand_indices)])\n", |
260 | 260 | "\n", |
261 | 261 | " print('CCS:')\n", |
262 | | - " ccs = rows[-5:-4]\n", |
| 262 | + " ccs = rows[slice(*ccs_indices)]\n", |
263 | 263 | " plot_array(ccs)\n", |
| 264 | + " if use_ccs_bq:\n", |
| 265 | + " print('CCS base quality scores:')\n", |
| 266 | + " ccs_bq = rows[slice(*ccs_bq_indices)]\n", |
| 267 | + " plot_array(ccs_bq)\n", |
264 | 268 | " print('SN:')\n", |
265 | | - " sn = rows[-4:]\n", |
| 269 | + " sn = rows[slice(*sn_indices)]\n", |
266 | 270 | " plot_array(sn)" |
267 | 271 | ] |
268 | 272 | }, |
|
279 | 283 | " return ''.join([dc_constants.SEQ_VOCAB[int(base)] for base in encoded_sequence])\n", |
280 | 284 | "\n", |
281 | 285 | "\n", |
282 | | - "def get_ccs_matrix(rows_for_one_example):\n", |
| 286 | + "def get_ccs_matrix(rows_for_one_example, max_passes, use_ccs_bq):\n", |
283 | 287 | " \"\"\"Slice the encoded CCS out of the 2d rows matrix of one example.\"\"\"\n", |
284 | | - " return rows_for_one_example[-5, :]\n", |
| 288 | + " _, _, _, _, ccs_indices, _, _ = data_providers.get_indices(\n", |
| 289 | + " max_passes=max_passes,\n", |
| 290 | + " use_ccs_bq=use_ccs_bq,\n", |
| 291 | + " )\n", |
| 292 | + " return rows_for_one_example[slice(*ccs_indices), :][0]\n", |
285 | 293 | "\n", |
286 | 294 | "\n", |
287 | | - "def get_ccs_seq(rows_for_one_example):\n", |
| 295 | + "def get_ccs_seq(rows_for_one_example, max_passes, use_ccs_bq):\n", |
288 | 296 | " \"\"\"Get CCS string out of the 2d rows matrix of one example.\"\"\"\n", |
289 | | - " return encoded_sequence_to_string(get_ccs_matrix(rows_for_one_example))\n", |
| 297 | + " return encoded_sequence_to_string(get_ccs_matrix(\n", |
| 298 | + " rows_for_one_example, max_passes, use_ccs_bq))\n", |
290 | 299 | "\n", |
291 | 300 | "\n", |
292 | 301 | "def colorful(seq):\n", |
|
305 | 314 | " return ''.join(colored_seq)\n", |
306 | 315 | "\n", |
307 | 316 | "\n", |
308 | | - "def show_example(batch, example_i, ypreds=None, max_passes=20):\n", |
| 317 | + "def show_example(batch, example_i, ypreds=None, max_passes=20, use_ccs_bq=True):\n", |
309 | 318 | " \"\"\"Show an example with subreads, ccs, predictions, and labels.\"\"\"\n", |
310 | 319 | " batch_size = batch['rows'].shape[0]\n", |
311 | 320 | " rows = batch['rows'][example_i, :, :, 0]\n", |
312 | | - " ccs_i = get_ccs_seq(rows)\n", |
| 321 | + " ccs_i = get_ccs_seq(rows, max_passes, use_ccs_bq)\n", |
313 | 322 | " print(f'Example: {example_i} of {batch_size} (batch)')\n", |
314 | 323 | " print('How the sequences are represented for the model:')\n", |
315 | 324 | " subreads = rows[0:max_passes, :]\n", |
|
360 | 369 | "name": "stdout", |
361 | 370 | "output_type": "stream", |
362 | 371 | "text": [ |
363 | | - "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.data-00000-of-00001...\n", |
364 | | - "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.index...\n", |
365 | | - "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/params.json...\n", |
| 372 | + "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.data-00000-of-00001...\n", |
| 373 | + "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.index...\n", |
| 374 | + "Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/params.json...\n", |
366 | 375 | "- [3 files][ 85.7 MiB/ 85.7 MiB] \n", |
367 | 376 | "Operation completed over 3 objects/85.7 MiB. \n", |
368 | | - "Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz...\n", |
| 377 | + "Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz...\n", |
369 | 378 | "/ [1 files][ 6.9 MiB/ 6.9 MiB] \n", |
370 | 379 | "Operation completed over 1 objects/6.9 MiB. \n" |
371 | 380 | ] |
|
374 | 383 | "source": [ |
375 | 384 | "# Download Model\n", |
376 | 385 | "! mkdir -p deepconsensus_model\n", |
377 | | - "! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/* deepconsensus_model/\n", |
| 386 | + "! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/* deepconsensus_model/\n", |
378 | 387 | "# Download test data\n", |
379 | | - "! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz" |
| 388 | + "! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz" |
380 | 389 | ] |
381 | 390 | }, |
382 | 391 | { |
|
451 | 460 | "model = model_utils.get_model(params)\n", |
452 | 461 | "checkpoint = tf.train.Checkpoint(model=model)\n", |
453 | 462 | "\n", |
454 | | - "row_size = data_providers.get_total_rows(params.max_passes)\n", |
| 463 | + "row_size = data_providers.get_total_rows(params.max_passes, params.use_ccs_bq)\n", |
455 | 464 | "input_shape = (1, row_size, params.max_length, params.num_channels)\n", |
456 | 465 | "model_utils.print_model_summary(model, input_shape)\n", |
457 | 466 | "checkpoint.restore(\n", |
|
500 | 509 | "text": [ |
501 | 510 | "name.shape: (20, 1)\n", |
502 | 511 | "label.shape: (20, 100)\n", |
503 | | - "rows.shape: (20, 85, 100, 1)\n", |
| 512 | + "rows.shape: (20, 86, 100, 1)\n", |
504 | 513 | "num_passes.shape: (20, 1)\n", |
505 | 514 | "window_pos.shape: (20, 1)\n" |
506 | 515 | ] |
|
604 | 613 | "source": [ |
605 | 614 | "### Let's break that down\n", |
606 | 615 | "\n", |
607 | | - "Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.1 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)." |
| 616 | + "Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.2 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)." |
608 | 617 | ] |
609 | 618 | }, |
610 | 619 | { |
|
978 | 987 | ], |
979 | 988 | "metadata": { |
980 | 989 | "colab": { |
| 990 | + "last_runtime": { |
| 991 | + "build_target": "//learning/genomics/internal:genomics_colab", |
| 992 | + "kind": "private" |
| 993 | + }, |
981 | 994 | "provenance": [ |
982 | 995 | { |
983 | 996 | "file_id": "/piper/depot/google3/learning/genomics/deepconsensus/opensource_only/g3doc/notebooks/Copy_of_Inspecting_DeepConsensus_examples_and_running_model.ipynb?workspaceId=belyaeva:dc_colab::citc", |
|
0 commit comments