@@ -1729,16 +1729,16 @@ <h2><span class="section-number">6.14.11. </span>Efficiently Join Large Datasets
17291729< span class ="kn "> import</ span > < span class ="w "> </ span > < span class ="nn "> pandas</ span > < span class ="w "> </ span > < span class ="k "> as</ span > < span class ="w "> </ span > < span class ="nn "> pd</ span >
17301730
17311731< span class ="c1 "> # Create two large DataFrames</ span >
1732- < span class ="n "> df1</ span > < span class ="o "> =</ span > < span class ="n "> pd</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s1 "> ' ;id' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1</ span > < span class ="p "> ,</ span > < span class ="mi "> 1000001</ span > < span class ="p "> ),</ span > < span class ="s1 "> ' ;value' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1000000</ span > < span class ="p "> )})</ span >
1733- < span class ="n "> df2</ span > < span class ="o "> =</ span > < span class ="n "> pd</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s1 "> ' ;id' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> ),</ span > < span class ="s1 "> ' ;value' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> )})</ span >
1732+ < span class ="n "> df1</ span > < span class ="o "> =</ span > < span class ="n "> pd</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s2 "> " ;id" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1</ span > < span class ="p "> ,</ span > < span class ="mi "> 1000001</ span > < span class ="p "> ),</ span > < span class ="s2 "> " ;value" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1000000</ span > < span class ="p "> )})</ span >
1733+ < span class ="n "> df2</ span > < span class ="o "> =</ span > < span class ="n "> pd</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s2 "> " ;id" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> ),</ span > < span class ="s2 "> " ;value" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> )})</ span >
17341734</ pre > </ div >
17351735</ div >
17361736</ div >
17371737</ div >
17381738< div class ="cell docutils container ">
17391739< div class ="cell_input docutils container ">
17401740< div class ="highlight-ipython3 notranslate "> < div class ="highlight "> < pre > < span > </ span > < span class ="o "> %%time</ span >
1741- < span class ="n "> result_pandas</ span > < span class ="o "> =</ span > < span class ="n "> pd</ span > < span class ="o "> .</ span > < span class ="n "> merge</ span > < span class ="p "> (</ span > < span class ="n "> df1</ span > < span class ="p "> ,</ span > < span class ="n "> df2</ span > < span class ="p "> ,</ span > < span class ="n "> on</ span > < span class ="o "> =</ span > < span class ="s1 "> ' ;id' ;</ span > < span class ="p "> ,</ span > < span class ="n "> how</ span > < span class ="o "> =</ span > < span class ="s1 "> ' ;inner' ;</ span > < span class ="p "> )</ span >
1741+ < span class ="n "> result_pandas</ span > < span class ="o "> =</ span > < span class ="n "> pd</ span > < span class ="o "> .</ span > < span class ="n "> merge</ span > < span class ="p "> (</ span > < span class ="n "> df1</ span > < span class ="p "> ,</ span > < span class ="n "> df2</ span > < span class ="p "> ,</ span > < span class ="n "> on</ span > < span class ="o "> =</ span > < span class ="s2 "> " ;id" ;</ span > < span class ="p "> ,</ span > < span class ="n "> how</ span > < span class ="o "> =</ span > < span class ="s2 "> " ;inner" ;</ span > < span class ="p "> )</ span >
17421742< span class ="nb "> print</ span > < span class ="p "> (</ span > < span class ="n "> result_pandas</ span > < span class ="o "> .</ span > < span class ="n "> head</ span > < span class ="p "> ())</ span >
17431743</ pre > </ div >
17441744</ div >
@@ -1763,16 +1763,16 @@ <h2><span class="section-number">6.14.11. </span>Efficiently Join Large Datasets
17631763< div class ="highlight-ipython3 notranslate "> < div class ="highlight "> < pre > < span > </ span > < span class ="kn "> import</ span > < span class ="w "> </ span > < span class ="nn "> polars</ span > < span class ="w "> </ span > < span class ="k "> as</ span > < span class ="w "> </ span > < span class ="nn "> pl</ span >
17641764
17651765< span class ="c1 "> # Create two large DataFrames</ span >
1766- < span class ="n "> df1</ span > < span class ="o "> =</ span > < span class ="n "> pl</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s1 "> ' ;id' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1</ span > < span class ="p "> ,</ span > < span class ="mi "> 1000001</ span > < span class ="p "> ),</ span > < span class ="s1 "> ' ;value' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1000000</ span > < span class ="p "> )})</ span >
1767- < span class ="n "> df2</ span > < span class ="o "> =</ span > < span class ="n "> pl</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s1 "> ' ;id' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> ),</ span > < span class ="s1 "> ' ;value' ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> )})</ span >
1766+ < span class ="n "> df1</ span > < span class ="o "> =</ span > < span class ="n "> pl</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s2 "> " ;id" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1</ span > < span class ="p "> ,</ span > < span class ="mi "> 1000001</ span > < span class ="p "> ),</ span > < span class ="s2 "> " ;value" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 1000000</ span > < span class ="p "> )})</ span >
1767+ < span class ="n "> df2</ span > < span class ="o "> =</ span > < span class ="n "> pl</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> ({</ span > < span class ="s2 "> " ;id" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> ),</ span > < span class ="s2 "> " ;value" ;</ span > < span class ="p "> :</ span > < span class ="nb "> range</ span > < span class ="p "> (</ span > < span class ="mi "> 500000</ span > < span class ="p "> ,</ span > < span class ="mi "> 1500000</ span > < span class ="p "> )})</ span >
17681768</ pre > </ div >
17691769</ div >
17701770</ div >
17711771</ div >
17721772< div class ="cell docutils container ">
17731773< div class ="cell_input docutils container ">
17741774< div class ="highlight-ipython3 notranslate "> < div class ="highlight "> < pre > < span > </ span > < span class ="o "> %%time</ span >
1775- < span class ="n "> result_polars</ span > < span class ="o "> =</ span > < span class ="n "> df1</ span > < span class ="o "> .</ span > < span class ="n "> join</ span > < span class ="p "> (</ span > < span class ="n "> df2</ span > < span class ="p "> ,</ span > < span class ="n "> on</ span > < span class ="o "> =</ span > < span class ="s1 "> ' ;id' ;</ span > < span class ="p "> ,</ span > < span class ="n "> how</ span > < span class ="o "> =</ span > < span class ="s1 "> ' ;inner' ;</ span > < span class ="p "> )</ span >
1775+ < span class ="n "> result_polars</ span > < span class ="o "> =</ span > < span class ="n "> df1</ span > < span class ="o "> .</ span > < span class ="n "> join</ span > < span class ="p "> (</ span > < span class ="n "> df2</ span > < span class ="p "> ,</ span > < span class ="n "> on</ span > < span class ="o "> =</ span > < span class ="s2 "> " ;id" ;</ span > < span class ="p "> ,</ span > < span class ="n "> how</ span > < span class ="o "> =</ span > < span class ="s2 "> " ;inner" ;</ span > < span class ="p "> )</ span >
17761776< span class ="nb "> print</ span > < span class ="p "> (</ span > < span class ="n "> result_polars</ span > < span class ="o "> .</ span > < span class ="n "> head</ span > < span class ="p "> ())</ span >
17771777</ pre > </ div >
17781778</ div >
@@ -1836,6 +1836,14 @@ <h2><span class="section-number">6.14.12. </span>Simplify Aggregations with Pola
18361836</ pre > </ div >
18371837</ div >
18381838</ div >
1839+ < div class ="cell_output docutils container ">
1840+ < div class ="output stream highlight-myst-ansi notranslate "> < div class ="highlight "> < pre > < span > </ span > value1_sum value2_mean
1841+ category
1842+ A 30 10.0
1843+ B 70 30.0
1844+ </ pre > </ div >
1845+ </ div >
1846+ </ div >
18391847</ div >
18401848< p > Here, you must explicitly define each aggregation using tuples, which can be tedious for larger datasets or more complex operations.</ p >
18411849< p > Polars simplifies this process with its declarative approach using < code class ="docutils literal notranslate "> < span class ="pre "> pl.col</ span > </ code > .</ p >
@@ -1852,17 +1860,29 @@ <h2><span class="section-number">6.14.12. </span>Simplify Aggregations with Pola
18521860< span class ="n "> df</ span > < span class ="o "> =</ span > < span class ="n "> pl</ span > < span class ="o "> .</ span > < span class ="n "> DataFrame</ span > < span class ="p "> (</ span > < span class ="n "> data</ span > < span class ="p "> )</ span >
18531861
18541862< span class ="c1 "> # Aggregations in Polars</ span >
1855- < span class ="n "> result</ span > < span class ="o "> =</ span > < span class ="p "> (</ span >
1856- < span class ="n "> df</ span > < span class ="o "> .</ span > < span class ="n "> group_by</ span > < span class ="p "> (</ span > < span class ="s2 "> "category"</ span > < span class ="p "> )</ span >
1857- < span class ="o "> .</ span > < span class ="n "> agg</ span > < span class ="p "> ([</ span >
1863+ < span class ="n "> result</ span > < span class ="o "> =</ span > < span class ="n "> df</ span > < span class ="o "> .</ span > < span class ="n "> group_by</ span > < span class ="p "> (</ span > < span class ="s2 "> "category"</ span > < span class ="p "> )</ span > < span class ="o "> .</ span > < span class ="n "> agg</ span > < span class ="p "> (</ span >
1864+ < span class ="p "> [</ span >
18581865 < span class ="n "> pl</ span > < span class ="o "> .</ span > < span class ="n "> col</ span > < span class ="p "> (</ span > < span class ="s2 "> "value1"</ span > < span class ="p "> )</ span > < span class ="o "> .</ span > < span class ="n "> sum</ span > < span class ="p "> ()</ span > < span class ="o "> .</ span > < span class ="n "> alias</ span > < span class ="p "> (</ span > < span class ="s2 "> "value1_sum"</ span > < span class ="p "> ),</ span >
18591866 < span class ="n "> pl</ span > < span class ="o "> .</ span > < span class ="n "> col</ span > < span class ="p "> (</ span > < span class ="s2 "> "value2"</ span > < span class ="p "> )</ span > < span class ="o "> .</ span > < span class ="n "> mean</ span > < span class ="p "> ()</ span > < span class ="o "> .</ span > < span class ="n "> alias</ span > < span class ="p "> (</ span > < span class ="s2 "> "value2_mean"</ span > < span class ="p "> ),</ span >
1860- < span class ="p "> ]) </ span >
1867+ < span class ="p "> ]</ span >
18611868< span class ="p "> )</ span >
18621869< span class ="nb "> print</ span > < span class ="p "> (</ span > < span class ="n "> result</ span > < span class ="p "> )</ span >
18631870</ pre > </ div >
18641871</ div >
18651872</ div >
1873+ < div class ="cell_output docutils container ">
1874+ < div class ="output stream highlight-myst-ansi notranslate "> < div class ="highlight "> < pre > < span > </ span > shape: (2, 3)
1875+ ┌──────────┬────────────┬─────────────┐
1876+ │ category ┆ value1_sum ┆ value2_mean │
1877+ │ --- ┆ --- ┆ --- │
1878+ │ str ┆ i64 ┆ f64 │
1879+ ╞══════════╪════════════╪═════════════╡
1880+ │ B ┆ 70 ┆ 30.0 │
1881+ │ A ┆ 30 ┆ 10.0 │
1882+ └──────────┴────────────┴─────────────┘
1883+ </ pre > </ div >
1884+ </ div >
1885+ </ div >
18661886</ div >
18671887< p > In this example, Polars allows you to define aggregations directly using expressions like < code class ="docutils literal notranslate "> < span class ="pre "> pl.col("value1").sum()</ span > </ code > and < code class ="docutils literal notranslate "> < span class ="pre "> pl.col("value2").mean()</ span > </ code > . This approach eliminates the need for tuples and makes the code more intuitive.</ p >
18681888< p > < a class ="reference external " href ="https://github.com/pola-rs/polars "> Link to Polars</ a > .</ p >
0 commit comments