Skip to content

Commit 7b53a62

Browse files
Update documentation
1 parent 691c3b1 commit 7b53a62

2 files changed

Lines changed: 70 additions & 24 deletions

File tree

Chapter5/polars.html

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1729,16 +1729,16 @@ <h2><span class="section-number">6.14.11. </span>Efficiently Join Large Datasets
17291729
<span class="kn">import</span><span class="w"> </span><span class="nn">pandas</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pd</span>
17301730

17311731
<span class="c1"># Create two large DataFrames</span>
1732-
<span class="n">df1</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;id&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1000001</span><span class="p">),</span> <span class="s1">&#39;value&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1000000</span><span class="p">)})</span>
1733-
<span class="n">df2</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;id&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">),</span> <span class="s1">&#39;value&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">)})</span>
1732+
<span class="n">df1</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;id&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1000001</span><span class="p">),</span> <span class="s2">&quot;value&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1000000</span><span class="p">)})</span>
1733+
<span class="n">df2</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;id&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">),</span> <span class="s2">&quot;value&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">)})</span>
17341734
</pre></div>
17351735
</div>
17361736
</div>
17371737
</div>
17381738
<div class="cell docutils container">
17391739
<div class="cell_input docutils container">
17401740
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="o">%%time</span>
1741-
<span class="n">result_pandas</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span><span class="n">df1</span><span class="p">,</span> <span class="n">df2</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">&#39;id&#39;</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;inner&#39;</span><span class="p">)</span>
1741+
<span class="n">result_pandas</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span><span class="n">df1</span><span class="p">,</span> <span class="n">df2</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">&quot;inner&quot;</span><span class="p">)</span>
17421742
<span class="nb">print</span><span class="p">(</span><span class="n">result_pandas</span><span class="o">.</span><span class="n">head</span><span class="p">())</span>
17431743
</pre></div>
17441744
</div>
@@ -1763,16 +1763,16 @@ <h2><span class="section-number">6.14.11. </span>Efficiently Join Large Datasets
17631763
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">polars</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pl</span>
17641764

17651765
<span class="c1"># Create two large DataFrames</span>
1766-
<span class="n">df1</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;id&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1000001</span><span class="p">),</span> <span class="s1">&#39;value&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1000000</span><span class="p">)})</span>
1767-
<span class="n">df2</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;id&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">),</span> <span class="s1">&#39;value&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">)})</span>
1766+
<span class="n">df1</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;id&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1000001</span><span class="p">),</span> <span class="s2">&quot;value&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1000000</span><span class="p">)})</span>
1767+
<span class="n">df2</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;id&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">),</span> <span class="s2">&quot;value&quot;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">500000</span><span class="p">,</span> <span class="mi">1500000</span><span class="p">)})</span>
17681768
</pre></div>
17691769
</div>
17701770
</div>
17711771
</div>
17721772
<div class="cell docutils container">
17731773
<div class="cell_input docutils container">
17741774
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="o">%%time</span>
1775-
<span class="n">result_polars</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">df2</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s1">&#39;id&#39;</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s1">&#39;inner&#39;</span><span class="p">)</span>
1775+
<span class="n">result_polars</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">df2</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">&quot;inner&quot;</span><span class="p">)</span>
17761776
<span class="nb">print</span><span class="p">(</span><span class="n">result_polars</span><span class="o">.</span><span class="n">head</span><span class="p">())</span>
17771777
</pre></div>
17781778
</div>
@@ -1836,6 +1836,14 @@ <h2><span class="section-number">6.14.12. </span>Simplify Aggregations with Pola
18361836
</pre></div>
18371837
</div>
18381838
</div>
1839+
<div class="cell_output docutils container">
1840+
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span> value1_sum value2_mean
1841+
category
1842+
A 30 10.0
1843+
B 70 30.0
1844+
</pre></div>
1845+
</div>
1846+
</div>
18391847
</div>
18401848
<p>Here, you must explicitly define each aggregation using tuples, which can be tedious for larger datasets or more complex operations.</p>
18411849
<p>Polars simplifies this process with its declarative approach using <code class="docutils literal notranslate"><span class="pre">pl.col</span></code>.</p>
@@ -1852,17 +1860,29 @@ <h2><span class="section-number">6.14.12. </span>Simplify Aggregations with Pola
18521860
<span class="n">df</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
18531861

18541862
<span class="c1"># Aggregations in Polars</span>
1855-
<span class="n">result</span> <span class="o">=</span> <span class="p">(</span>
1856-
<span class="n">df</span><span class="o">.</span><span class="n">group_by</span><span class="p">(</span><span class="s2">&quot;category&quot;</span><span class="p">)</span>
1857-
<span class="o">.</span><span class="n">agg</span><span class="p">([</span>
1863+
<span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">group_by</span><span class="p">(</span><span class="s2">&quot;category&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span>
1864+
<span class="p">[</span>
18581865
<span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;value1&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;value1_sum&quot;</span><span class="p">),</span>
18591866
<span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;value2&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;value2_mean&quot;</span><span class="p">),</span>
1860-
<span class="p">])</span>
1867+
<span class="p">]</span>
18611868
<span class="p">)</span>
18621869
<span class="nb">print</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
18631870
</pre></div>
18641871
</div>
18651872
</div>
1873+
<div class="cell_output docutils container">
1874+
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>shape: (2, 3)
1875+
┌──────────┬────────────┬─────────────┐
1876+
│ category ┆ value1_sum ┆ value2_mean │
1877+
│ --- ┆ --- ┆ --- │
1878+
│ str ┆ i64 ┆ f64 │
1879+
╞══════════╪════════════╪═════════════╡
1880+
│ B ┆ 70 ┆ 30.0 │
1881+
│ A ┆ 30 ┆ 10.0 │
1882+
└──────────┴────────────┴─────────────┘
1883+
</pre></div>
1884+
</div>
1885+
</div>
18661886
</div>
18671887
<p>In this example, Polars allows you to define aggregations directly using expressions like <code class="docutils literal notranslate"><span class="pre">pl.col(&quot;value1&quot;).sum()</span></code> and <code class="docutils literal notranslate"><span class="pre">pl.col(&quot;value2&quot;).mean()</span></code>. This approach eliminates the need for tuples and makes the code more intuitive.</p>
18681888
<p><a class="reference external" href="https://github.com/pola-rs/polars">Link to Polars</a>.</p>

_sources/Chapter5/polars.ipynb

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1821,8 +1821,8 @@
18211821
"import pandas as pd\n",
18221822
"\n",
18231823
"# Create two large DataFrames\n",
1824-
"df1 = pd.DataFrame({'id': range(1, 1000001), 'value': range(1000000)})\n",
1825-
"df2 = pd.DataFrame({'id': range(500000, 1500000), 'value': range(500000, 1500000)})"
1824+
"df1 = pd.DataFrame({\"id\": range(1, 1000001), \"value\": range(1000000)})\n",
1825+
"df2 = pd.DataFrame({\"id\": range(500000, 1500000), \"value\": range(500000, 1500000)})"
18261826
]
18271827
},
18281828
{
@@ -1848,7 +1848,7 @@
18481848
],
18491849
"source": [
18501850
"%%time\n",
1851-
"result_pandas = pd.merge(df1, df2, on='id', how='inner')\n",
1851+
"result_pandas = pd.merge(df1, df2, on=\"id\", how=\"inner\")\n",
18521852
"print(result_pandas.head())"
18531853
]
18541854
},
@@ -1872,8 +1872,8 @@
18721872
"import polars as pl\n",
18731873
"\n",
18741874
"# Create two large DataFrames\n",
1875-
"df1 = pl.DataFrame({'id': range(1, 1000001), 'value': range(1000000)})\n",
1876-
"df2 = pl.DataFrame({'id': range(500000, 1500000), 'value': range(500000, 1500000)})"
1875+
"df1 = pl.DataFrame({\"id\": range(1, 1000001), \"value\": range(1000000)})\n",
1876+
"df2 = pl.DataFrame({\"id\": range(500000, 1500000), \"value\": range(500000, 1500000)})"
18771877
]
18781878
},
18791879
{
@@ -1905,7 +1905,7 @@
19051905
],
19061906
"source": [
19071907
"%%time\n",
1908-
"result_polars = df1.join(df2, on='id', how='inner')\n",
1908+
"result_polars = df1.join(df2, on=\"id\", how=\"inner\")\n",
19091909
"print(result_polars.head())"
19101910
]
19111911
},
@@ -1945,12 +1945,23 @@
19451945
},
19461946
{
19471947
"cell_type": "code",
1948-
"execution_count": null,
1948+
"execution_count": 5,
19491949
"id": "3c50b76d",
19501950
"metadata": {
19511951
"lines_to_next_cell": 2
19521952
},
1953-
"outputs": [],
1953+
"outputs": [
1954+
{
1955+
"name": "stdout",
1956+
"output_type": "stream",
1957+
"text": [
1958+
" value1_sum value2_mean\n",
1959+
"category \n",
1960+
"A 30 10.0\n",
1961+
"B 70 30.0\n"
1962+
]
1963+
}
1964+
],
19541965
"source": [
19551966
"import pandas as pd\n",
19561967
"\n",
@@ -1982,12 +1993,28 @@
19821993
},
19831994
{
19841995
"cell_type": "code",
1985-
"execution_count": null,
1996+
"execution_count": 6,
19861997
"id": "518e5668",
19871998
"metadata": {
19881999
"lines_to_next_cell": 2
19892000
},
1990-
"outputs": [],
2001+
"outputs": [
2002+
{
2003+
"name": "stdout",
2004+
"output_type": "stream",
2005+
"text": [
2006+
"shape: (2, 3)\n",
2007+
"┌──────────┬────────────┬─────────────┐\n",
2008+
"│ category ┆ value1_sum ┆ value2_mean │\n",
2009+
"│ --- ┆ --- ┆ --- │\n",
2010+
"│ str ┆ i64 ┆ f64 │\n",
2011+
"╞══════════╪════════════╪═════════════╡\n",
2012+
"│ B ┆ 70 ┆ 30.0 │\n",
2013+
"│ A ┆ 30 ┆ 10.0 │\n",
2014+
"└──────────┴────────────┴─────────────┘\n"
2015+
]
2016+
}
2017+
],
19912018
"source": [
19922019
"import polars as pl\n",
19932020
"\n",
@@ -2000,12 +2027,11 @@
20002027
"df = pl.DataFrame(data)\n",
20012028
"\n",
20022029
"# Aggregations in Polars\n",
2003-
"result = (\n",
2004-
" df.group_by(\"category\")\n",
2005-
" .agg([\n",
2030+
"result = df.group_by(\"category\").agg(\n",
2031+
" [\n",
20062032
" pl.col(\"value1\").sum().alias(\"value1_sum\"),\n",
20072033
" pl.col(\"value2\").mean().alias(\"value2_mean\"),\n",
2008-
" ])\n",
2034+
" ]\n",
20092035
")\n",
20102036
"print(result)"
20112037
]

0 commit comments

Comments
 (0)