<?xml version="1.0" encoding="gb2312"?>
<rss version="2.0">
    <channel>
        <title>斯巴达第二季</title>
        <link>http://donghao.org/</link>
        <description>董昊</description>
        <language>zh-cn</language>
        <copyright>Copyright 2010</copyright>
        <lastBuildDate>Thu, 22 Jul 2010 17:46:43 +0800</lastBuildDate>
        <generator>http://www.sixapart.com/movabletype/</generator>
        <docs>http://www.rssboard.org/rss-specification</docs>
        
        <item>
            <title>[c++] 小心析构函数</title>
            <description><![CDATA[先上代码，很少，就两个类。<p></p><p id="zw-129f981640frUEIW2dbd2a"></p><div class="" id="zw-129f9816d23pH6Ilx2dbd2a"><span id="zw-129f9816d24ee1xxV2dbd2a">&nbsp;&nbsp;#include &lt;iostream&gt;</span></div><div class="" id="zw-129f9816d24DyR11U2dbd2a"><br id="zw-129f9816d24Ks0jF82dbd2a" /></div><div class="" id="zw-129f9816d24easwu2dbd2a"><span id="zw-129f9816d257mft62dbd2a">&nbsp;&nbsp;using namespace std;</span></div><div class="" id="zw-129f9816d25nCkDQa2dbd2a"><br id="zw-129f9816d25lG2GEh2dbd2a" /></div><div class="" id="zw-129f9816d259q1w42dbd2a"><span id="zw-129f9816d255ttINu2dbd2a">&nbsp;&nbsp;struct Pig</span></div><div class="" id="zw-129f9816d25xPwdMA2dbd2a"><span id="zw-129f9816d266z9CY_2dbd2a">&nbsp;&nbsp;{</span></div><div class="" id="zw-129f9816d26uKTDqx2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;virtual void call(void)</div><div class="" id="zw-129f9816d2aXHiE72dbd2a"><span id="zw-129f9816d2aijV2fF2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;{</span></div><div class="" id="zw-129f9816d2atBNMOo2dbd2a"><span id="zw-129f9816d2aPUZT2Z2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;cout &lt;&lt; "Pig\n";</span></div><div class="" id="zw-129f9816d2aBaPKyS2dbd2a"><span id="zw-129f9816d2boN8oA2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;}</span></div><div class="" id="zw-129f9816d2bebnj2C2dbd2a"><br id="zw-129f9816d2biygVb12dbd2a" /></div><div class="" id="zw-129f9816d2b4miki2dbd2a"><span id="zw-129f9816d2bw-4Zlp2dbd2a">&nbsp;&nbsp;};</span></div><div class="" id="zw-129f9816d2buTyLH72dbd2a"><br id="zw-129f9816d2cyO2_al2dbd2a" /></div><div class="" id="zw-129f9816d2c5a6U7d2dbd2a"><span id="zw-129f9816d2ccE42fg2dbd2a">&nbsp;&nbsp;struct SmallPig : public Pig</span></div><div class="" id="zw-129f9816d2cLiPBt2dbd2a"><span id="zw-129f9816d2cG70RSn2dbd2a">&nbsp;&nbsp;{</span></div><div class="" id="zw-129f9816d2cHKR8M2dbd2a"><span id="zw-129f9816d2diVKyq32dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;virtual void call(void)</span></div><div class="" id="zw-129f9816d2dRISm8c2dbd2a"><span 
id="zw-129f9816d2d4ILshc2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;{</span></div><div class="" id="zw-129f9816d2dOSuXe2dbd2a"><span id="zw-129f9816d2dnr7X0p2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;cout &lt;&lt; "Small Pig\n";</span></div><div class="" id="zw-129f9816d2dY-aihX2dbd2a"><span id="zw-129f9816d2e1goT7-2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;}</span></div><div class="" id="zw-129f9816d2eRPBRCy2dbd2a"><span id="zw-129f9816d2eQFLIzd2dbd2a">&nbsp;&nbsp;};</span></div><div class="" id="zw-129f9816d2eJV7dak2dbd2a"><br id="zw-129f9816d2eID5mra2dbd2a" /></div><div class="" id="zw-129f9816d2ehaZcDg2dbd2a"><span id="zw-129f9816d2fOl6ptJ2dbd2a">&nbsp;&nbsp;int main(int argc, char* argv[])</span></div><div class="" id="zw-129f9816d2fQvtXuV2dbd2a"><span id="zw-129f9816d2fdBEXKH2dbd2a">&nbsp;&nbsp;{</span></div><div class="" id="zw-129f9816d2fqLwPX22dbd2a"><span id="zw-129f9816d2fvX46DH2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;Pig* p = new SmallPig;</span></div><div class="" id="zw-129f9816d30I2fWwE2dbd2a"><span id="zw-129f9816d30kesL2H2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;p-&gt;call();</span></div><div class="" id="zw-129f9816d30A6EyGz2dbd2a"><span id="zw-129f9816d30yXjt_Y2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;delete p;</span></div><div class="" id="zw-129f9816d30J6nyIO2dbd2a"><span id="zw-129f9816d30ID5Xvu2dbd2a">&nbsp;&nbsp;}</span></div><div id="zw-129f9816d31NuViz2dbd2a"><br id="zw-129f9816d31bg7oNX2dbd2a" /></div><p id="zw-129f9816d342Em5tD2dbd2a"></p><p id="zw-129f9816d34Zli1o2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">打印出什么？这个简单，了解c++的都能答对：打出 "S<span id="zw-129f9823fddPl6GUU2dbd2a">mall Pig“，因为是虚函数嘛。</span></p><p id="zw-129f98265bfZuZNiN2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129f98265bf3r-XL2dbd2a">好，现在改改Pig类：</span></p><p id="zw-129f982cecePVyo5y2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "></p><div class="" id="zw-129f982fb1bXYBZhA2dbd2a"><span id="zw-129f982fb1b03dfpP2dbd2a">&nbsp;&nbsp;struct Pig</span></div><div class="" 
id="zw-129f982fb1b8JXSkV2dbd2a"><span id="zw-129f982fb1cv6tMdl2dbd2a">&nbsp;&nbsp;{</span></div><div class="" id="zw-129f982fb1c5kZTY2dbd2a"><span id="zw-129f982fb1cRk-5kf2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;Pig()</span></div><div class="" id="zw-129f982fb1cIttC-z2dbd2a"><span id="zw-129f982fb1dKSFPrA2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;{</span></div><div class="" id="zw-129f982fb1dtlZXn62dbd2a"><span id="zw-129f982fb1d9WcK-72dbd2a">&nbsp;&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;call();</span></div><div class="" id="zw-129f982fb1dGCl4dz2dbd2a"><span id="zw-129f982fb1esPizP62dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;}</span></div><div class="" id="zw-129f982fb1eSg3KfL2dbd2a"><br id="zw-129f982fb1e7sS-DY2dbd2a" /></div><div class="" id="zw-129f982fb1eP9TGqY2dbd2a"><span id="zw-129f982fb1e56bc6T2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;~Pig()</span></div><div class="" id="zw-129f982fb1fZcB_lG2dbd2a"><span id="zw-129f982fb1fyHip2e2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;{</span></div><div class="" id="zw-129f982fb1fb14UlA2dbd2a"><span id="zw-129f982fb1fh3CZG32dbd2a">&nbsp;&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;call();</span></div><div class="" id="zw-129f982fb20XzLBZC2dbd2a"><span id="zw-129f982fb20_g72RO2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;}</span></div><div class="" id="zw-129f982fb20JKR9112dbd2a"><br id="zw-129f982fb20W65VHr2dbd2a" /></div><div class="" id="zw-129f982fb20tBrgTM2dbd2a"><span id="zw-129f982fb20iE4Kj02dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;virtual void call(void)</span></div><div class="" id="zw-129f982fb20exP5JF2dbd2a"><span id="zw-129f982fb20D_5A3k2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;{</span></div><div class="" id="zw-129f982fb20ODIuz2dbd2a"><span id="zw-129f982fb21Z3huNf2dbd2a">&nbsp;&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;cout &lt;&lt; "Pig\n";</span></div><div class="" id="zw-129f982fb215SveW2dbd2a"><span id="zw-129f982fb21pOlcV42dbd2a">&nbsp;&nbsp; &nbsp; &nbsp;}</span></div><div class="" id="zw-129f982fb21fk-vQ42dbd2a"><br id="zw-129f982fb22Wih_122dbd2a" /></div><div class="" 
id="zw-129f982fb224ECU2dbd2a"><span id="zw-129f982fb22aFWViV2dbd2a">&nbsp;&nbsp;};</span></div><div id="zw-129f982fb22zAfJRN2dbd2a"><br id="zw-129f982fb22n5z5JS2dbd2a" /></div><p id="zw-129f982fb25JDIe2dbd2a"></p><p id="zw-129f982fb25HS7S3f2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">再运行，就不是打出<span id="zw-129f9832d52vdPTof2dbd2a">3行“Small Pig”了，而是：</span></p><p id="zw-129f98371c1bN8BfI2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129f98371c1-q36Te2dbd2a">Pig</span></p><p id="zw-129f9838659OY4G_i2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129f9838659Pq8MH92dbd2a">Small Pig</span></p><p id="zw-129f9839a94bzac002dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129f9839a9488HwZH2dbd2a">Pig</span></p><p id="zw-129f9839f0fQXv3Re2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129f9839f0fmSHyJ2dbd2a">为什么？因为构造函数和析构函数特殊，在它们里面this指针只能当成自己用（而不是当成子类），所以调用虚函数的结果是调用了父类的实现。</span></p><p id="zw-129f9839f0fQXv3Re2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129f9839f0fmSHyJ2dbd2a"></span><span id="zw-129f989ad6b57WgB2dbd2a">这个问题造成了今天的bug，花了不少时间。其实这个注意事项在</span><a id="zw-129f989ad6bAvCCPw2dbd2a" title="《Effective c++》" target="_blank" href="http://book.douban.com/subject/1842426/"><span id="zw-129f989ad34vufSeg2dbd2a">《Effective c++》</span></a><span id="zw-129f989ad6bxG9Be82dbd2a">里是有的，我也看过，但是....开发中谁还记得那么多条条框框？还是实际犯错印象比较深刻。</span></p><p id="zw-129f9871758r4Ztf2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129f9871758WVd_wq2dbd2a">有人问了：如果我把call改成纯虚函数会怎样呢？更郁闷，g++编译的时候就会提示构造函数里的call“找不到实现”！</span></p>]]></description>
            <link>http://donghao.org/2010/07/c-aioey.html</link>
            <guid>http://donghao.org/2010/07/c-aioey.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">软件开发</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">cpp</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">析构函数</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">构造函数</category>
            
            <pubDate>Thu, 22 Jul 2010 17:46:43 +0800</pubDate>
        </item>
        
        <item>
            <title>[kernel] linux在多核处理器上的负载均衡原理</title>
            <description><![CDATA[【转载请注明出处：<span id="zw-129c9db3f20tNCpB2dbd2a"><a href="http://donghao.org/uii/" id="zw-129c9db3f20iG6LUw2dbd2a">http://donghao.org/uii/</a>&nbsp;】</span><p></p><h2 id="zw-129c9dac5874scQdR2dbd2a">【原理】</h2><p id="zw-129c5e9e686moQArM2dbd2a">现在互联网公司使用的都是多CPU（多核）的服务器了，Linux操作系统会自动把任务分配到不同的处理器上，并尽可能的保持负载均衡。那Linux内核是怎么做到让各个CPU的压力均匀的呢？</p><p id="zw-129c5805c6eIHh0vO2dbd2a">做一个负载均衡机制，重点在于：</p><p id="zw-129c580851b2qKkp42dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c5808593g77akw2dbd2a">1. 何时检查并调整负载情况？</span><br id="zw-129c580851beXqWpE2dbd2a" /></p><p id="zw-129c5837612vIEp8h2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c5837612ML6j12dbd2a">2. 如何调整负载？</span></p><p id="zw-129c5839ed2Uzz2Mb2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c5839ed3uNyO2O2dbd2a">先看第一个问题。</span></p><p id="zw-129c585a4a9rl4L4Y2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">如果让我这样的庸俗程序员来设计，我第一个想到的就是每隔一段时间检查一次负载是否均衡，不均则调整之，这肯定不是最高效的办法，但肯定是实现上最简单的。实际上，<span id="zw-129c58456c6pjoa3B2dbd2a">2.6.20版linux kernel的确使用软中断来定时调整多CPU上的压力（调用函数run_rebalance_domains），每秒1次</span><span id="zw-129c58456c6zbAB_2dbd2a">。</span></p><p id="zw-129c5880558FScXLV2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">但每秒一次还是不能满足要求，对很多应用来说，<span id="zw-129c58d3e0dW1Me9n2dbd2a">1秒太长了，一秒钟内如果发生负载失衡对很多web应用都是不能接受的，何况其他实时应用。最好kernel能够紧跟进程的变化来调整。</span></p><p id="zw-129c58f09f8O7CgTM2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c592cb1e4U2iO02dbd2a">那么，好，</span><span id="zw-129c592cb1e4GsYC2dbd2a">我们在进程创建和进程exit的时候检查并调整负载呢？可以，但是不完整，一个进程创建以后如果频繁的睡眠、醒来、睡眠、醒来，它这样折腾对CPU的负载是有影响的，</span><span id="zw-129c592cbc4utYD9A2dbd2a">你就不管它了吗？说到底，我们其实关注的是进程是否在使用CPU，而不是它是否诞生了。所以，我们应该在进程睡眠和醒来这两个时间点检查CPU们的负载。</span></p><p id="zw-129c5db5c004jyQMu2dbd2a" style="text-align: justify; margin-bottom: 12pt; margin-top: 0pt; "><span 
id="zw-129c5db5c00sbPwKa2dbd2a">再看第二个问题，怎么调整负载呢？从最繁忙的那个CPU上挪一个进程到最闲的那个CPU上，如果负载还不均衡，就再挪一个进程，如果还不均衡，继续挪....这也是个最笨的方法，但它却真的是linux&nbsp;CPU负载均衡的核心，不过实际的算法在此基础上有很多细化。对于Intel的CPU，压缩在同一个chip上的多核是共享同一个L2的（如下图，里面的一个Processor其实就是一个chip），如果任务能尽可能的分配在同一个chip上，L2&nbsp;cache就可以继续使用，这对运行速度是有帮助的。所以除非“很不均衡”，否则尽量不要把一个chip上的任务挪到其他chip上。</span></p><p id="zw-129c5e418b4tPztob2dbd2a" style="text-align: justify; margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c5e418b4lkLIDd2dbd2a"><img align="middle" alt="Intel Multi-Core CPU" border="0" hspace="0" id="zw-129c5e4d535I2Tku32dbd2a" src="http://writer.zoho.com/image.do?imgurl=27c929fe53698e0d9b3cb46b35d4e45dcc3ce9803a0e8d54b8a55f66891df28829bd57fa5258b45e3aa927bb6c54f8a6" vspace="0" style="height: 331px; width: 492px; " /></span></p><p id="zw-129c5e8bdbdH88nKw2dbd2a" style="text-align: left; margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c5e8bdbd_vr3u-2dbd2a">于是，为了应对这种CPU core之间的异质性——在不同的core之间迁移任务，代价不同——Linux kernel引入了sched_domain和sched_group的概念。sched_domain和sched_group的具体原理，可参考</span><a href="http://www.ibm.com/developerworks/cn/linux/l-cn-schldom/index.html" id="zw-129c5e8bdbd7X7WCm2dbd2a" target="_blank" title="中文资料"><span id="zw-129c5e7bf040-NSC72dbd2a">刘勃的文章</span></a><span id="zw-129c5e8bdbd5TGsC2dbd2a">和</span><a href="http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-7-sect-5.html" id="zw-129c5e8bdbds7yMHY2dbd2a" target="_blank" title="英文资料"><span id="zw-129c5e8bd5fRdTr02dbd2a">英文资料</span></a><span id="zw-129c5e8bdbdTPxTch2dbd2a">。</span></p><h2>【代码剖析】</h2><p id="zw-129c5ea1397ItagoU2dbd2a" style="text-align: left; margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c5ea221eg7cB4v2dbd2a"></span></p><p id="zw-129c5e3ad80g4PWN32dbd2a" style="text-align: justify; margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c5e3ad80LtPbTZ2dbd2a">SMP负载均衡检查或调整</span>在两个内核函数里发生：</p><p id="zw-129c5782fc4ZuOe9C2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span 
id="zw-129c5782fc4HNRc4N2dbd2a">1. schedule（）。当进程调用了sleep、usleep、poll、epoll、pause时，也就是调用了可能睡去的操作时都会转为内核代码里对schedule()函数的调用。</span></p><p id="zw-129c579a6c7vPgi5w2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c579a6c7fyjQEs2dbd2a">2. try_to_wake_up() 。说白了就是进程刚才睡了，现在要醒来，那醒来以后跑在哪个CPU上呢？这个选择CPU的过程，也就是负载均衡的过程。</span></p><p id="zw-129c562336fyYCvhw2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">我们先看<span id="zw-129c598e696wzBPdp2dbd2a">schedule（）的代码，我们忽略函数前面那些和负载均衡无关的代码（本文代码以内核2.6.20版为准）：</span></p><p id="zw-129c599217b1d-eHp2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span id="zw-129c599217bSV7nJ2dbd2a">[kernel/sched.c --&gt; schedule() ]</span></p><p id="zw-129c5998394ToP23F2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "></p><div class="" id="zw-129c59bb5278Zgmh2dbd2a"><span id="zw-129c59bb528j4mNR62dbd2a">&nbsp;&nbsp;3489 &nbsp; &nbsp; cpu = smp_processor_id();</span></div><div class="" id="zw-129c59bb528nvRnQG2dbd2a"><span id="zw-129c59bb529ngPFbb2dbd2a">&nbsp;&nbsp;3490 &nbsp; &nbsp; if (unlikely(!rq-&gt;nr_running)) {</span></div><div class="" id="zw-129c59bb5292s79-L2dbd2a"><span id="zw-129c59bb529hFmcQP2dbd2a">&nbsp;&nbsp;</span><span id="zw-129c5a5c044-y-WrX2dbd2a" style="color: rgb(255, 0, 0); ">3491 &nbsp; &nbsp; &nbsp; &nbsp; idle_balance(cpu, rq);</span></div><div class="" id="zw-129c59bb52a9GcE_2dbd2a"><span id="zw-129c59bb52a4owQIs2dbd2a">&nbsp;&nbsp;3492 &nbsp; &nbsp; &nbsp; &nbsp; if (!rq-&gt;nr_running) {</span></div><div class="" id="zw-129c59bb52aPIscF2dbd2a"><span id="zw-129c59bb52aNxUZCS2dbd2a">&nbsp;&nbsp;3493 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; next = rq-&gt;idle;</span></div><div class="" id="zw-129c59bb52bJRU9bm2dbd2a"><span id="zw-129c59bb52bmmPllM2dbd2a">&nbsp;&nbsp;3494 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; rq-&gt;expired_timestamp = 0;</span></div><div class="" id="zw-129c59bb52bHRt0YW2dbd2a"><span id="zw-129c59bb52cB9v_2dbd2a">&nbsp;&nbsp;3495 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 
&nbsp; wake_sleeping_dependent(cpu);</span></div><div class="" id="zw-129c59bb52cgvYYvR2dbd2a"><span id="zw-129c59bb52cvpsm8S2dbd2a">&nbsp;&nbsp;3496 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto switch_tasks;</span></div><div class="" id="zw-129c59bb52cmuEHp52dbd2a"><span id="zw-129c59bb52d-VrsKc2dbd2a">&nbsp;&nbsp;3497 &nbsp; &nbsp; &nbsp; &nbsp; }</span></div><div class="" id="zw-129c59bb52d5DLtta2dbd2a"><span id="zw-129c59bb52dbeIsWH2dbd2a">&nbsp;&nbsp;3498 &nbsp; &nbsp; }</span></div><div class="" id="zw-129c59c10b9nbYStx2dbd2a"><span id="zw-129c59c10b9U_9OWG2dbd2a"><br /></span></div><div class="" id="zw-129c59c11e6un-iS72dbd2a">每个CPU都有一个运行队列即这里的<span id="zw-129c5a073b4jYRRfZ2dbd2a">rq，运行队列里放着该CPU要运行的进程，如果运行队列里没有进程了，就说明当前CPU没有可调度的任务了，那就要调用idle_balance从其它CPU上“平衡”一些（就是挪一些）进程到当前rq里。</span><span id="zw-129c5a043edY91iKP2dbd2a"></span></div><div id="zw-129c59bb52e3oZ_W2dbd2a"><br id="zw-129c5eb68b20HcGAu2dbd2a" /></div><div id="zw-129c5eb68b21Hvt62dbd2a">再看<span id="zw-129c5eb79beESnaHi2dbd2a">idle_balance（）的实现：</span><span id="zw-129c5eb705a8bvt_G2dbd2a"></span></div><div id="zw-129c5eb9a51p4WRKL2dbd2a"><span id="zw-129c5eb9a52fgoJ7Z2dbd2a"><br /></span></div><div id="zw-129c5eba22eo0wjTt2dbd2a"><span id="zw-129c5eba22e44ZCE52dbd2a">[kernel/sched.c --&gt; idle_balance()]</span></div><div id="zw-129c5ec4be8e8F0pE2dbd2a"></div><div class="" id="zw-129c5ec51c7h8YN-22dbd2a"><span id="zw-129c5ec51c7uIexfO2dbd2a">&nbsp;&nbsp;2806 /*</span></div><div class="" id="zw-129c5ec51c8eCBIG62dbd2a"><span id="zw-129c5ec51c8MHxLs2dbd2a">&nbsp;&nbsp;2807 &nbsp;* idle_balance is called by schedule() if this_cpu is about to become</span></div><div class="" id="zw-129c5ec51c8fJT2aH2dbd2a"><span id="zw-129c5ec51c9TeAwy2dbd2a">&nbsp;&nbsp;2808 &nbsp;* idle. 
Attempts to pull tasks from other CPUs.</span></div><div class="" id="zw-129c5ec51c9s5LyFz2dbd2a"><span id="zw-129c5ec51c9UCtlrZ2dbd2a">&nbsp;&nbsp;2809 &nbsp;*/</span></div><div class="" id="zw-129c5ec51cahwKF4P2dbd2a"><span id="zw-129c5ec51cav175Ww2dbd2a">&nbsp;&nbsp;2810 static void idle_balance(int this_cpu, struct rq *this_rq)</span></div><div class="" id="zw-129c5ec51cas2gSlg2dbd2a"><span id="zw-129c5ec51cb6ZN_jP2dbd2a">&nbsp;&nbsp;2811 {</span></div><div class="" id="zw-129c5ec51cb-KZ0sW2dbd2a"><span id="zw-129c5ec51cbmFsCz2dbd2a">&nbsp;&nbsp;2812 &nbsp; &nbsp; struct sched_domain *sd;</span></div><div class="" id="zw-129c5ec51cb-6mMxW2dbd2a"><span id="zw-129c5ec51ccmyl2dbd2a">&nbsp;&nbsp;2813 &nbsp; &nbsp; int pulled_task = 0;</span></div><div class="" id="zw-129c5ec51cch9B1-2dbd2a"><span id="zw-129c5ec51ccrqGKPq2dbd2a">&nbsp;&nbsp;2814 &nbsp; &nbsp; unsigned long next_balance = jiffies + 60 * &nbsp;HZ;</span></div><div class="" id="zw-129c5ec51cdv-UIZM2dbd2a"><span id="zw-129c5ec51cdVeVuU02dbd2a">&nbsp;&nbsp;2815</span></div><div class="" id="zw-129c5ec51cdOPUTz2dbd2a"><span id="zw-129c98136f6I1_5i2dbd2a" style="color: rgb(0, 0, 255); ">&nbsp;&nbsp;2816 &nbsp; &nbsp; for_each_domain(this_cpu, sd) {</span></div><div class="" id="zw-129c5ec51ce6ZFC3E2dbd2a"><span id="zw-129c5ec51cen5-JHy2dbd2a">&nbsp;&nbsp;2817 &nbsp; &nbsp; &nbsp; &nbsp; unsigned long interval;</span></div><div class="" id="zw-129c5ec51cf65mAkn2dbd2a"><span id="zw-129c5ec51cfZn_nIU2dbd2a">&nbsp;&nbsp;2818</span></div><div class="" id="zw-129c5ec51cf59AOs2dbd2a"><span id="zw-129c5ec51d08yvGdd2dbd2a">&nbsp;&nbsp;2819 &nbsp; &nbsp; &nbsp; &nbsp; if (!(sd-&gt;flags &amp; SD_LOAD_BALANCE))</span></div><div class="" id="zw-129c5ec51d0zRmAG62dbd2a"><span id="zw-129c5ec51d0giWVIU2dbd2a">&nbsp;&nbsp;2820 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; continue;</span></div><div class="" id="zw-129c5ec51d0bYwgom2dbd2a"><span id="zw-129c5ec51d1TNuX3H2dbd2a">&nbsp;&nbsp;2821</span></div><div class="" 
id="zw-129c5ec51d1047tuk2dbd2a"><span id="zw-129c5ec51d1HD6x82dbd2a">&nbsp;&nbsp;2822 &nbsp; &nbsp; &nbsp; &nbsp; if (sd-&gt;flags &amp; SD_BALANCE_NEWIDLE)</span></div><div class="" id="zw-129c5ec51d2mInH0B2dbd2a"><span id="zw-129c5ec51d2wZpNYM2dbd2a">&nbsp;&nbsp;2823 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; /* If we've pulled tasks over stop searching: */</span></div><div class="" id="zw-129c5ec51d2HqkXQD2dbd2a"><span id="zw-129c5eccbc1678-6s2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2824 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pulled_task = load_balance_newidle(this_cpu,</span></div><div class="" id="zw-129c5ec51d3Kygtd42dbd2a"><span id="zw-129c5ec51d3yDyF2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2825 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; this_rq, sd);</span></div><div class="" id="zw-129c5ec51d4igSqUk2dbd2a"><span id="zw-129c5ec51d4tX06px2dbd2a">&nbsp;&nbsp;2826</span></div><div class="" id="zw-129c5ec51d4x-p9IC2dbd2a"><span id="zw-129c5ec51d4AQjm1W2dbd2a">&nbsp;&nbsp;2827 &nbsp; &nbsp; &nbsp; &nbsp; interval = msecs_to_jiffies(sd-&gt;balance_interval);</span></div><div class="" id="zw-129c5ec51d5AMgup62dbd2a"><span id="zw-129c5ec51d55JD1GC2dbd2a">&nbsp;&nbsp;2828 &nbsp; &nbsp; &nbsp; &nbsp; if (time_after(next_balance, sd-&gt;last_balance + interval))</span></div><div class="" id="zw-129c5ec51d6t9QG6u2dbd2a"><span id="zw-129c5ec51d66dHEEQ2dbd2a">&nbsp;&nbsp;2829 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; next_balance = sd-&gt;last_balance + interval;</span></div><div class="" id="zw-129c5ec51d6JC2zf2dbd2a"><span id="zw-129c5ec51d6taZcrV2dbd2a">&nbsp;&nbsp;2830 &nbsp; &nbsp; &nbsp; &nbsp; if (pulled_task)</span></div><div class="" id="zw-129c5ec51d72OiEeO2dbd2a"><span id="zw-129c5ec51d7StK1vD2dbd2a">&nbsp;&nbsp;2831 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; break;</span></div><div class="" id="zw-129c5ec51d71r_1Y62dbd2a"><span 
id="zw-129c5ec51d8kiYhd2dbd2a">&nbsp;&nbsp;2832 &nbsp; &nbsp; }</span></div><div class="" id="zw-129c5ec51d8WSAAq2dbd2a"><span id="zw-129c5ec51d8Rq4F392dbd2a">&nbsp;&nbsp;2833 &nbsp; &nbsp; if (!pulled_task)</span></div><div class="" id="zw-129c5ec51d9RDfiwi2dbd2a"><span id="zw-129c5ec51d9mMY3_m2dbd2a">&nbsp;&nbsp;2834 &nbsp; &nbsp; &nbsp; &nbsp; /*</span></div><div class="" id="zw-129c5ec51d9U-kfDW2dbd2a"><span id="zw-129c5ec51d9f7Xw-r2dbd2a">&nbsp;&nbsp;2835 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* We are going idle. next_balance may be set based on</span></div><div class="" id="zw-129c5ec51davPngwx2dbd2a"><span id="zw-129c5ec51daSZPqX2dbd2a">&nbsp;&nbsp;2836 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* a busy processor. So reset next_balance.</span></div><div class="" id="zw-129c5ec51dalB6F0l2dbd2a"><span id="zw-129c5ec51db129tR2dbd2a">&nbsp;&nbsp;2837 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/</span></div><div class="" id="zw-129c5ec51db5BNP2o2dbd2a"><span id="zw-129c5ec51dbTcGmF2dbd2a">&nbsp;&nbsp;2838 &nbsp; &nbsp; &nbsp; &nbsp; this_rq-&gt;next_balance = next_balance;</span></div><div class="" id="zw-129c5ec51dcOyhOxF2dbd2a"><span id="zw-129c5ec51dcFVmaff2dbd2a">&nbsp;&nbsp;2839 }</span></div><div id="zw-129c5ec51dcQLW3HM2dbd2a"><br id="zw-129c5ec64c8_77QJG2dbd2a" /></div><div id="zw-129c5ec64c8QNv1hU2dbd2a">从子<span id="zw-129c5ed3c0bT9mO92dbd2a">sched_domain到父sched_domain遍历该CPU对应的domain（2816行），并调用load_balance_newidle，我们继续：</span></div><div id="zw-129c5edb0a6a-6XNw2dbd2a"><span id="zw-129c5edb0a66Mr2Pz2dbd2a"><br /></span></div><div id="zw-129c5edb3b8MiAb6t2dbd2a"><span id="zw-129c5edb3b888UHe2dbd2a">[kernel/sched.c --&gt; load_balance_newidle()]</span></div><div id="zw-129c5ee10cdocS08B2dbd2a"></div><div class="" id="zw-129c5ee5bb7Nhk2io2dbd2a"><span id="zw-129c5ee5bb7WDDsi2dbd2a">2730 static int</span></div><div class="" id="zw-129c5ee5bb7IFmIsM2dbd2a"><span id="zw-129c5ee5bb8EqwG022dbd2a">&nbsp;&nbsp;2731 load_balance_newidle(int this_cpu, struct rq *this_rq, struct 
sched_domain *sd)</span></div><div class="" id="zw-129c5ee5bb8kywbWD2dbd2a"><span id="zw-129c5ee5bb8LhA8Oi2dbd2a">&nbsp;&nbsp;2732 {</span></div><div class="" id="zw-129c5ee5bb9OCuj_c2dbd2a"><span id="zw-129c5ee5bb9-LR8WH2dbd2a">&nbsp;&nbsp;2733 &nbsp; &nbsp; struct sched_group *group;</span></div><div class="" id="zw-129c5ee5bb9fHUUa92dbd2a"><span id="zw-129c5ee5bb9r3yCir2dbd2a">&nbsp;&nbsp;2734 &nbsp; &nbsp; struct rq *busiest = NULL;</span></div><div class="" id="zw-129c5ee5bb9gCv5lW2dbd2a"><span id="zw-129c5ee5bb9rdgyph2dbd2a">&nbsp;&nbsp;2735 &nbsp; &nbsp; unsigned long imbalance;</span></div><div class="" id="zw-129c5ee5bb9ocvY7L2dbd2a"><span id="zw-129c5ee5bbanWuzFo2dbd2a">&nbsp;&nbsp;2736 &nbsp; &nbsp; int nr_moved = 0;</span></div><div class="" id="zw-129c5ee5bbacpkOl62dbd2a"><span id="zw-129c5ee5bbaNXcgG2dbd2a">&nbsp;&nbsp;2737 &nbsp; &nbsp; int sd_idle = 0;</span></div><div class="" id="zw-129c5ee5bbaweFyyA2dbd2a"><span id="zw-129c5ee5bbbedhQk2dbd2a">&nbsp;&nbsp;2738 &nbsp; &nbsp; cpumask_t cpus = CPU_MASK_ALL;</span></div><div class="" id="zw-129c5ee5bbb5JyYtj2dbd2a"><span id="zw-129c5ee5bbb2tRrJ2dbd2a">&nbsp;&nbsp;2739</span></div><div class="" id="zw-129c5ee5bbb6VKhtN2dbd2a"><span id="zw-129c5ee5bbcs2vey2dbd2a">&nbsp;&nbsp;2740 &nbsp; &nbsp; /*</span></div><div class="" id="zw-129c5ee5bbc8whKBL2dbd2a"><span id="zw-129c5ee5bbcWH12gU2dbd2a">&nbsp;&nbsp;2741 &nbsp; &nbsp; &nbsp;* When power savings policy is enabled for the parent domain, idle</span></div><div class="" id="zw-129c5ee5bbdZ0PUko2dbd2a"><span id="zw-129c5ee5bbd2mMWDb2dbd2a">&nbsp;&nbsp;2742 &nbsp; &nbsp; &nbsp;* sibling can pick up load irrespective of busy siblings. 
In this case,</span></div><div class="" id="zw-129c5ee5bbdsAn_x2dbd2a"><span id="zw-129c5ee5bbd5ViaIH2dbd2a">&nbsp;&nbsp;2743 &nbsp; &nbsp; &nbsp;* let the state of idle sibling percolate up as IDLE, instead of</span></div><div class="" id="zw-129c5ee5bbepSKt182dbd2a"><span id="zw-129c5ee5bbejIOken2dbd2a">&nbsp;&nbsp;2744 &nbsp; &nbsp; &nbsp;* portraying it as NOT_IDLE.</span></div><div class="" id="zw-129c5ee5bbeRRqqYi2dbd2a"><span id="zw-129c5ee5bbfeuHMKZ2dbd2a">&nbsp;&nbsp;2745 &nbsp; &nbsp; &nbsp;*/</span></div><div class="" id="zw-129c5ee5bbfbU47Ch2dbd2a"><span id="zw-129c5ee5bbfiFN6A2dbd2a">&nbsp;&nbsp;2746 &nbsp; &nbsp; if (sd-&gt;flags &amp; SD_SHARE_CPUPOWER &amp;&amp;</span></div><div class="" id="zw-129c5ee5bc0Pk1u6_2dbd2a"><span id="zw-129c5ee5bc0Ka5Vv2dbd2a">&nbsp;&nbsp;2747 &nbsp; &nbsp; &nbsp; &nbsp; !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))</span></div><div class="" id="zw-129c5ee5bc0hZXDul2dbd2a"><span id="zw-129c5ee5bc0LU3Gvv2dbd2a">&nbsp;&nbsp;2748 &nbsp; &nbsp; &nbsp; &nbsp; sd_idle = 1;</span></div><div class="" id="zw-129c5ee5bc1TTsug82dbd2a"><span id="zw-129c5ee5bc1DUmhSf2dbd2a">&nbsp;&nbsp;2749</span></div><div class="" id="zw-129c5ee5bc1Ydcy-c2dbd2a"><span id="zw-129c5ee5bc2OrmJig2dbd2a">&nbsp;&nbsp;2750 &nbsp; &nbsp; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);</span></div><div class="" id="zw-129c5ee5bc2GEwTwV2dbd2a"><span id="zw-129c5ee5bc2k3jgZ12dbd2a">&nbsp;&nbsp;2751 redo:</span></div><div class="" id="zw-129c5ee5bc3lIaeBZ2dbd2a"><span id="zw-129c5ee91f45KwahV2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2752 &nbsp; &nbsp; group = find_busiest_group(sd, this_cpu, &amp;imbalance, NEWLY_IDLE,</span></div><div class="" id="zw-129c5ee5bc3yDE4DF2dbd2a"><span id="zw-129c5ee5bc3AIgCMd2dbd2a">&nbsp;&nbsp;2753 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&amp;sd_idle, &amp;cpus, NULL);</span></div><div class="" id="zw-129c5ee5bc4a0kHaq2dbd2a"><span id="zw-129c5ee5bc4c5Yh72dbd2a">&nbsp;&nbsp;2754 &nbsp; &nbsp; if 
(!group) {</span></div><div class="" id="zw-129c5ee5bc4MHc2X92dbd2a"><span id="zw-129c5ee5bc4HlL4v82dbd2a">&nbsp;&nbsp;2755 &nbsp; &nbsp; &nbsp; &nbsp; schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);</span></div><div class="" id="zw-129c5ee5bc5cCCtjG2dbd2a"><span id="zw-129c5ee5bc5sGa1t2dbd2a">&nbsp;&nbsp;2756 &nbsp; &nbsp; &nbsp; &nbsp; goto out_balanced;</span></div><div class="" id="zw-129c5ee5bc5O0nROV2dbd2a"><span id="zw-129c5ee5bc6TSQC9e2dbd2a">&nbsp;&nbsp;2757 &nbsp; &nbsp; }</span></div><div class="" id="zw-129c5ee5bc6wFSzYb2dbd2a"><span id="zw-129c5ee5bc6CcK9NK2dbd2a">&nbsp;&nbsp;2758</span></div><div class="" id="zw-129c5ee5bc6CQd5-V2dbd2a"><span id="zw-129c5eeb24eTnuy2I2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2759 &nbsp; &nbsp; busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,</span></div><div class="" id="zw-129c5ee5bc7-gFzL2dbd2a"><span id="zw-129c5ee5bc7fdCzt2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2760 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &amp;cpus);</span></div><div class="" id="zw-129c5ee5bc8khWd4Z2dbd2a"><span id="zw-129c5ee5bc84gNtiY2dbd2a">&nbsp;&nbsp;2761 &nbsp; &nbsp; if (!busiest) {</span></div><div class="" id="zw-129c5ee5bc8mRbMJI2dbd2a"><span id="zw-129c5ee5bc8IubN8k2dbd2a">&nbsp;&nbsp;2762 &nbsp; &nbsp; &nbsp; &nbsp; schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);</span></div><div class="" id="zw-129c5ee5bc9DL5Y502dbd2a"><span id="zw-129c5ee5bc9FZC6YP2dbd2a">&nbsp;&nbsp;2763 &nbsp; &nbsp; &nbsp; &nbsp; goto out_balanced;</span></div><div class="" id="zw-129c5ee5bc9v1Xwwt2dbd2a"><span id="zw-129c5ee5bc9lZ6PRW2dbd2a">&nbsp;&nbsp;2764 &nbsp; &nbsp; }</span></div><div class="" id="zw-129c5ee5bcaJKFM22dbd2a"><span id="zw-129c5ee5bcawSsGzP2dbd2a">&nbsp;&nbsp;2765</span></div><div class="" id="zw-129c5ee5bcai6XWSv2dbd2a"><span id="zw-129c5ee5bcbA_O42dbd2a">&nbsp;&nbsp;2766 &nbsp; &nbsp; BUG_ON(busiest == this_rq);</span></div><div class="" id="zw-129c5ee5bcbKEBT-O2dbd2a"><span 
id="zw-129c5ee5bcbDSKfrx2dbd2a">&nbsp;&nbsp;2767</span></div><div class="" id="zw-129c5ee5bcbWyx30Q2dbd2a"><span id="zw-129c5ee5bccAlMyUs2dbd2a">&nbsp;&nbsp;2768 &nbsp; &nbsp; schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);</span></div><div class="" id="zw-129c5ee5bccejIE72dbd2a"><span id="zw-129c5ee5bccHgLLcS2dbd2a">&nbsp;&nbsp;2769</span></div><div class="" id="zw-129c5ee5bcdNKQef12dbd2a"><span id="zw-129c5ee5bcdfaYI1s2dbd2a">&nbsp;&nbsp;2770 &nbsp; &nbsp; nr_moved = 0;</span></div><div class="" id="zw-129c5ee5bcdV7dBY62dbd2a"><span id="zw-129c5ee5bcdmQnkeW2dbd2a">&nbsp;&nbsp;2771 &nbsp; &nbsp; if (busiest-&gt;nr_running &gt; 1) {</span></div><div class="" id="zw-129c5ee5bceeVpMW42dbd2a"><span id="zw-129c5ee5bcegGY5x2dbd2a">&nbsp;&nbsp;2772 &nbsp; &nbsp; &nbsp; &nbsp; /* Attempt to move tasks */</span></div><div class="" id="zw-129c5ee5bceAfFWXC2dbd2a"><span id="zw-129c5ee5bcfQ6chBc2dbd2a">&nbsp;&nbsp;2773 &nbsp; &nbsp; &nbsp; &nbsp; double_lock_balance(this_rq, busiest);</span></div><div class="" id="zw-129c5ee5bcfdTjaAO2dbd2a"><span id="zw-129c5ef4657ISnsYj2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2774 &nbsp; &nbsp; &nbsp; &nbsp; nr_moved = move_tasks(this_rq, this_cpu, busiest,</span></div><div class="" id="zw-129c5ee5bcfwCcs2h2dbd2a"><span id="zw-129c5ee5bd0OnSNPM2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2775 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; minus_1_or_zero(busiest-&gt;nr_running),</span></div><div class="" id="zw-129c5ee5bd0rllTN42dbd2a"><span id="zw-129c5ee5bd0TJRIQN2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;2776 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; imbalance, sd, NEWLY_IDLE, NULL);</span></div><div id="zw-129c5ee5bd14jhSa32dbd2a"><br id="zw-129c5ee5bd1fAelT2dbd2a" /></div><div id="zw-129c5ef5951WS-5zS2dbd2a">原来就是我们上面说的“笨办法”，针对当前CPU所属的每个domain（从子到父），找到该<span 
id="zw-129c5f010a3mXtISa2dbd2a">sched_domain里最忙的sched_group（2752行），再从该group里找出最忙的运行队列（2759行），最后从该“最忙”运行队列里挑出几个进程到当前CPU的运行队列里。move_tasks函数到底挪多少进程到当前CPU是由第4和第5个参数决定的，第4个参数是指最多挪多少个进程，第5个参数是指最多挪多少“压力”。有了这两个参数限制，就不会挪过头了（即把太多进程挪到当前CPU，造成新的不均衡）。</span></div><div id="zw-129c98704db3h_p9m2dbd2a"><span id="zw-129c98704dbgrJo2dbd2a"><br /></span></div><div id="zw-129c987090fd4AHI2dbd2a"><span id="zw-129c987090frFmNbo2dbd2a">举个例子，假如有一台8核的机器，两个CPU插槽，也就是两个chip，每个chip上4个核，再假设现在core 4最忙，core 0第二忙，如图：</span></div><div id="zw-129c99ce90efuI-0k2dbd2a"><span id="zw-129c99ce90eJUwNTJ2dbd2a"><img align="middle" alt="" border="0" hspace="0" id="zw-129c9a0b3f4pmFQ4w2dbd2a" src="http://writer.zoho.com/image.do?imgurl=27c929fe53698e0d9b3cb46b35d4e45dcc3ce9803a0e8d54b8a55f66891df2884034c687b942b29389fb0f680e1d1b18" vspace="0" style="height: 240px; width: 419px; " /></span></div><div id="zw-129c987a7f1KYoyxG2dbd2a"><span id="zw-129c9a17e6efHgoLD2dbd2a">按照</span><span id="zw-129c9a17f2dnoyIJA2dbd2a"><a href="http://www.ibm.com/developerworks/cn/linux/l-cn-schldom/index.html" id="zw-129c9a17f2djkuSKm2dbd2a" target="_blank" title="中文资料">刘勃的文章</a>里的提法，首先是core domain，即Processor 0属于domain 1，Processor 1属于domain 2，其中domain 1包含4个sched_group，每个group对应一个core，如下图（group未画出）：</span></div><div id="zw-129c9a30335rZ6o6n2dbd2a"><span id="zw-129c9a3033584ePsK2dbd2a"><img align="middle" alt="" border="0" hspace="0" id="zw-129c9a3634aTHxpW2dbd2a" src="http://writer.zoho.com/image.do?imgurl=27c929fe53698e0d9b3cb46b35d4e45dcc3ce9803a0e8d54b8a55f66891df288fc159409f04c52a7b7e01d78b4d1f172" vspace="0" style="height: 317px; width: 514px; " /></span></div><div id="zw-129c5feb6f021WPni2dbd2a"><span id="zw-129c5feb6f05svZRj2dbd2a">假如现在是 Core 3&nbsp;在执行idle_balance，则先在domain 1里找最忙的group，找到第二忙的group是core 0（core 4不在domain 1里，所以不会找到它），再从core 0里找最忙的runqueue（运行队列），core 0就一个运行队列，所以直接就是它对应的runqueue了，然后从该runqueue里挪出几个任务到Core 3，这一层domain的均衡做完了。</span></div><div id="zw-129c9a5c381xo6IMi2dbd2a"><br id="zw-129c9a5c9e8ZyVPyW2dbd2a" 
/></div><div id="zw-129c9a5c9e8RHr08E2dbd2a">接着是domain&nbsp;<span id="zw-129c9a65b06xe3gz82dbd2a">1的父domain，即 cpu_domain，下图的domain 0：</span></div><div id="zw-129c9a6a77dl-fWXK2dbd2a"><span id="zw-129c9a6a77dJX-OBL2dbd2a"><img align="middle" alt="" border="0" hspace="0" id="zw-129c9a6d560X_vqQO2dbd2a" src="http://writer.zoho.com/image.do?imgurl=27c929fe53698e0d9b3cb46b35d4e45dcc3ce9803a0e8d54b8a55f66891df2889394903d65f7c60901be74699310c7fc" vspace="0" style="height: 320px; width: 529px; " /></span></div><div id="zw-129c9a36e622slJrQ2dbd2a"><span id="zw-129c9a36e62cfx6J72dbd2a">这个domain 0包含了两个group，每个group对应一个chip，即每个group对应了4个core。</span></div><div id="zw-129c9a9e805cUXl0k2dbd2a"><span id="zw-129c9a9e805YfG5Qf2dbd2a">在domain 0找最繁忙的group，显然会找到Processor1 对应的group（因为core 4超忙），那么继续在Processor 1里找最忙的runqueue，于是找到core 4，最后从core 4的runqueue里挑出几个任务挪到core 3,。</span></div><div id="zw-129c9b5e792Rlc1my2dbd2a"><span id="zw-129c9b5e792lLcHsk2dbd2a">这样，整个系统8个核都基本平衡了。</span></div><div id="zw-129c9b64972NJW24y2dbd2a"><span id="zw-129c9b64972OJEotj2dbd2a"><br /></span></div><div id="zw-129c9b64ab2DnsAX02dbd2a"><span id="zw-129c9b64ab3Yrzx72dbd2a">也许有人要问，为什么是从子domain到父domain这样遍历，而不是倒过来，从父到子遍历呢？这是因为子domain通常都是在一个chip上，任务的很多数据在共享的L2 cache上，为了不让其失效，有必要尽量让任务保持在一个chip上。</span></div><div id="zw-129c9b942810hOL1x2dbd2a"><span id="zw-129c9b942816drIr92dbd2a"><br /></span></div><div id="zw-129c9b94414hCBmX72dbd2a"><span id="zw-129c9b94414T83TvL2dbd2a">也许还有人要问：如果core 3本来就是最忙的core，它如果运行idle_balance，会发生什么？答案是什么也不会发生。因为在find_busiest_group函数里，如果发现最忙的是“本CPU”，那么就直接返回NULL，也就不再做任何事。</span></div><div id="zw-129c9e1746eVLaUuM2dbd2a"><span id="zw-129c9e1746eeMWFs2dbd2a">那core 3岂不永远是最忙的了？呵呵，大家忘了，系统里总有闲的CPU（哪怕是相对比较闲），它总会执行schedule()，就算它从不调用sleep从不睡眠，时钟中断也会迫使其进程切换，进而调用schedule，进而将繁忙CPU的任务揽一部分到自己身上。这样，谁最闲，谁早晚会从忙人身上揽活儿过来，所以忙人不会永远最忙，闲人也不会永远最闲，所以就平等，就均衡了。</span></div><div id="zw-129c9a6e3bevPcHA2dbd2a"><span id="zw-129c9a6e3bfCl1_M52dbd2a"><br /></span></div><div id="zw-129c5feba0bPFyDXt2dbd2a"><span 
id="zw-129c5feba0bymLgD42dbd2a">再看try_to_wake_up()：</span></div><div id="zw-129c5ff7c53Vw-OK_2dbd2a"><span id="zw-129c5ff7c53HZSJRI2dbd2a">[kernel/sched.c --&gt; try_to_wake_up()]</span></div><div id="zw-129c600c89bbsRm2e2dbd2a"></div><div class="" id="zw-129c600cc30Z8BL8e2dbd2a"><span id="zw-129c600cc30PX8ig2dbd2a">1398 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)</span></div><div class="" id="zw-129c600cc302E7ti2dbd2a"><span id="zw-129c600cc30DIvuq2dbd2a">&nbsp;&nbsp;1399 {</span></div><div class="" id="zw-129c600cc30qDfbsT2dbd2a">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &nbsp;......<br /></div><div class="" id="zw-129c600cc37cJ2hc72dbd2a"><span id="zw-129c600cc37X28jNd2dbd2a"></span></div><div class="" id="zw-129c600cc37F4HsTE2dbd2a"><span id="zw-129c600cc37EAYRrh2dbd2a">&nbsp;&nbsp;1417</span></div><div class="" id="zw-129c600cc373HN--e2dbd2a"><span id="zw-129c600cc38n2gFES2dbd2a">&nbsp;&nbsp;1418 &nbsp; &nbsp; cpu = task_cpu(p);</span></div><div class="" id="zw-129c600cc38gqBjZr2dbd2a"><span id="zw-129c600cc38Hq0lCP2dbd2a">&nbsp;&nbsp;1419 &nbsp; &nbsp; this_cpu = smp_processor_id();</span></div><div class="" id="zw-129c600cc38DKeyJV2dbd2a"><span id="zw-129c600cc384z3Lh-2dbd2a">&nbsp;&nbsp;1420</span></div><div class="" id="zw-129c600cc381VPMhQ2dbd2a"><span id="zw-129c600cc398pW-tE2dbd2a">&nbsp;&nbsp;1421 #ifdef CONFIG_SMP</span></div><div class="" id="zw-129c600cc398S7w9t2dbd2a"><span id="zw-129c600cc39CLm_T2dbd2a">&nbsp;&nbsp;1422 &nbsp; &nbsp; if (unlikely(task_running(rq, p)))</span></div><div class="" id="zw-129c600cc39ODL9Ev2dbd2a"><span id="zw-129c600cc39DmRjt2dbd2a">&nbsp;&nbsp;1423 &nbsp; &nbsp; &nbsp; &nbsp; goto out_activate;</span></div><div class="" id="zw-129c600cc3azm22eJ2dbd2a"><span id="zw-129c600cc3ajz9eJ2dbd2a">&nbsp;&nbsp;1424</span></div><div class="" id="zw-129c600cc3a7aud8w2dbd2a"><span id="zw-129c600cc3aUB2-dq2dbd2a">&nbsp;&nbsp;1425 &nbsp; &nbsp; new_cpu = cpu;</span></div><div 
class="" id="zw-129c600cc3aSUiSZA2dbd2a"><span id="zw-129c600cc3bSUDIa2dbd2a">&nbsp;&nbsp;1426</span></div><div class="" id="zw-129c600cc3bP0F7H2dbd2a"><span id="zw-129c600cc3bOWVj_A2dbd2a">&nbsp;&nbsp;1427 &nbsp; &nbsp; schedstat_inc(rq, ttwu_cnt);</span></div><div class="" id="zw-129c600cc3bujX8Ay2dbd2a"><span id="zw-129c600cc3b-Ua9Yc2dbd2a">&nbsp;</span><span id="zw-129c600fbf8yHO-d2dbd2a" style="color: rgb(0, 0, 255); ">&nbsp;1428 &nbsp; &nbsp; if (cpu == this_cpu) {</span></div><div class="" id="zw-129c600cc3bpR9OtZ2dbd2a"><span id="zw-129c600cc3cidfcda2dbd2a">&nbsp;&nbsp;1429 &nbsp; &nbsp; &nbsp; &nbsp; schedstat_inc(rq, ttwu_local);</span></div><div class="" id="zw-129c600cc3c5hRYEz2dbd2a"><span id="zw-129c600cc3cb4gBzX2dbd2a">&nbsp;&nbsp;1430 &nbsp; &nbsp; &nbsp; &nbsp; goto out_set_cpu;</span></div><div class="" id="zw-129c600cc3cw87um2dbd2a"><span id="zw-129c600cc3cBc-NwT2dbd2a">&nbsp;&nbsp;1431 &nbsp; &nbsp; }</span></div><div id="zw-129c600cc3da7W--a2dbd2a"><br id="zw-129c600cc3dX9Acix2dbd2a" /></div><p id="zw-129c600cc43gO6vxD2dbd2a"></p><div id="zw-129c600cc43FcT5ol2dbd2a"><span id="zw-129c6011b7440O92x2dbd2a">变量this_cpu和变量cpu有什么区别？变量this_cpu是实际运行这个函数的处理器（“目标处理器”），而变量cpu是进程p在睡眠之前运行的处理器——为了方便我们暂且称之为“源处理器”。当然，这两个处理器也可能是同一个，比如进程p在处理器A上运行，然后睡眠，而运行try_to_wake_up的也是处理器A，其实这样就最好了，进程p在处理器A里cache的数据都不用动，直接让A运行p就行了——这就是1428行的逻辑。</span><br id="zw-129c600caf2NaXF0_2dbd2a" /></div><div id="zw-129c605abfd3ANpZ92dbd2a"><span id="zw-129c605abfd3KE6S2dbd2a"><br /></span></div><div id="zw-129c605aee5kYTeI2dbd2a"><span id="zw-129c605aee5B9N8qj2dbd2a">如果this_cpu和cpu不是同一个处理器，那么代码继续：</span></div><div id="zw-129c605fcedZR_irg2dbd2a"></div><div class="" id="zw-129c6064579g3vWv32dbd2a"><span id="zw-129c6064579o15wjE2dbd2a">&nbsp;&nbsp;1447 &nbsp; &nbsp; if (this_sd) {</span></div><div class="" id="zw-129c606457a283zvR2dbd2a"><span id="zw-129c606457aC_uF9t2dbd2a">&nbsp;&nbsp;1448 &nbsp; &nbsp; &nbsp; &nbsp; int idx = this_sd-&gt;wake_idx;</span></div><div class="" 
id="zw-129c606457aWZFsZ72dbd2a"><span id="zw-129c606457atDr1N2dbd2a">&nbsp;&nbsp;1449 &nbsp; &nbsp; &nbsp; &nbsp; unsigned int imbalance;</span></div><div class="" id="zw-129c606457aSVmnjr2dbd2a"><span id="zw-129c606457adLTbrh2dbd2a">&nbsp;&nbsp;1450</span></div><div class="" id="zw-129c606457bYf0Xix2dbd2a"><span id="zw-129c606457b8JSAAh2dbd2a">&nbsp;&nbsp;1451 &nbsp; &nbsp; &nbsp; &nbsp; imbalance = 100 + (this_sd-&gt;imbalance_pct - 100) / 2;</span></div><div class="" id="zw-129c606457bEyxHOu2dbd2a"><span id="zw-129c606457bZQqZMo2dbd2a">&nbsp;&nbsp;1452</span></div><div class="" id="zw-129c606457b18qZ262dbd2a"><span id="zw-129c606457bYEySpH2dbd2a">&nbsp;&nbsp;</span><span id="zw-129c609aec2iUhj4W2dbd2a" style="color: rgb(0, 0, 255); ">1453 &nbsp; &nbsp; &nbsp; &nbsp; load = source_load(cpu, idx);</span></div><div class="" id="zw-129c606457cgYlQ2dbd2a"><span id="zw-129c606457cBudTAw2dbd2a" style="color: rgb(0, 0, 255); ">&nbsp;&nbsp;1454 &nbsp; &nbsp; &nbsp; &nbsp; this_load = target_load(this_cpu, idx);</span></div><div class="" id="zw-129c606457c6-0kY52dbd2a"><span id="zw-129c606457cUsq8P2dbd2a">&nbsp;&nbsp;1455</span></div><div class="" id="zw-129c606457ch6ll172dbd2a"><span id="zw-129c606457dvQt8YJ2dbd2a">&nbsp;&nbsp;1456 &nbsp; &nbsp; &nbsp; &nbsp; new_cpu = this_cpu; /* Wake to this CPU if we can */</span></div><div class="" id="zw-129c606457dh8set52dbd2a"><span id="zw-129c606457dy5TT5q2dbd2a">&nbsp;&nbsp;1457</span></div><div class="" id="zw-129c606457dTC9AD2dbd2a"><span id="zw-129c606457dD5S0hl2dbd2a">&nbsp;&nbsp;1458 &nbsp; &nbsp; &nbsp; &nbsp; if (this_sd-&gt;flags &amp; SD_WAKE_AFFINE) {</span></div><div class="" id="zw-129c606457eDebWmp2dbd2a"><span id="zw-129c606457eia0lI2dbd2a">&nbsp;&nbsp;1459 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; unsigned long tl = this_load;</span></div><div class="" id="zw-129c606457eem2lc2dbd2a"><span id="zw-129c606457eHo23Pv2dbd2a">&nbsp;&nbsp;1460 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; unsigned long 
tl_per_task;</span></div><div class="" id="zw-129c606457eC81lDU2dbd2a"><span id="zw-129c606457frWcL-f2dbd2a">&nbsp;&nbsp;1461</span></div><div class="" id="zw-129c606457f1OARXM2dbd2a"><span id="zw-129c606457f3jhG32dbd2a">&nbsp;&nbsp;1462 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; tl_per_task = cpu_avg_load_per_task(this_cpu);</span></div><div class="" id="zw-129c606457fVAVGcN2dbd2a"><span id="zw-129c606457f-tTnZQ2dbd2a">&nbsp;&nbsp;1463</span></div><div class="" id="zw-129c606457fFQskN2dbd2a"><span id="zw-129c6064580-II0sF2dbd2a">&nbsp;&nbsp;1464 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; /*</span></div><div class="" id="zw-129c6064580GX5_W_2dbd2a"><span id="zw-129c6064580stCono2dbd2a">&nbsp;&nbsp;1465 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* If sync wakeup then subtract the (maximum possible)</span></div><div class="" id="zw-129c6064580kS7TY2dbd2a"><span id="zw-129c60645807fAr2I2dbd2a">&nbsp;&nbsp;1466 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* effect of the currently running task from the load</span></div><div class="" id="zw-129c6064581SrsUYz2dbd2a"><span id="zw-129c60645810LJkM2dbd2a">&nbsp;&nbsp;1467 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* of the current CPU:</span></div><div class="" id="zw-129c6064581wH0-BH2dbd2a"><span id="zw-129c6064581YyWKQW2dbd2a">&nbsp;&nbsp;1468 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/</span></div><div class="" id="zw-129c6064581qBNR2x2dbd2a"><span id="zw-129c6064581mzT3i2dbd2a">&nbsp;&nbsp;1469 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; if (sync)</span></div><div class="" id="zw-129c6064582cwOSS72dbd2a"><span id="zw-129c6064582KYP9m2dbd2a">&nbsp;&nbsp;1470 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; tl -= current-&gt;load_weight;</span></div><div class="" id="zw-129c6064582KTRT1e2dbd2a"><span id="zw-129c6064582oKa3F22dbd2a">&nbsp;&nbsp;1471</span></div><div class="" id="zw-129c6064582n6lJjv2dbd2a"><span id="zw-129c6064583bMhv2Y2dbd2a">&nbsp;</span><span id="zw-129c606c383GCHEH_2dbd2a" 
style="color: rgb(255, 0, 0); ">&nbsp;1472 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; if ((tl &lt;= load &amp;&amp;</span></div><div class="" id="zw-129c6064583Y3yKi52dbd2a"><span id="zw-129c6064583z0AJPf2dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;1473 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; tl + target_load(cpu, idx) &lt;= tl_per_task) ||</span></div><div class="" id="zw-129c6064583_yP71D2dbd2a"><span id="zw-129c6064583wiUKL42dbd2a" style="color: rgb(255, 0, 0); ">&nbsp;&nbsp;1474 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 100*(tl + p-&gt;load_weight) &lt;= imbalance*load) {</span></div><div class="" id="zw-129c6064584QghIrB2dbd2a"><span id="zw-129c60645842UJjBx2dbd2a">&nbsp;&nbsp;1475 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; /*</span></div><div class="" id="zw-129c6064584CTLCmo2dbd2a"><span id="zw-129c6064584JID8Xn2dbd2a">&nbsp;&nbsp;1476 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* This domain has SD_WAKE_AFFINE and</span></div><div class="" id="zw-129c6064584qKTFLf2dbd2a"><span id="zw-129c6064584Taiki2dbd2a">&nbsp;&nbsp;1477 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* p is cache cold in this domain, and</span></div><div class="" id="zw-129c6064585fg1Sj32dbd2a"><span id="zw-129c60645850dfWNl2dbd2a">&nbsp;&nbsp;1478 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* there is no bad imbalance.</span></div><div class="" id="zw-129c6064585pjJxy2dbd2a"><span id="zw-129c60645851IoRjC2dbd2a">&nbsp;&nbsp;1479 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/</span></div><div class="" id="zw-129c6064585OufLW62dbd2a"><span id="zw-129c6064585HMWy8K2dbd2a">&nbsp;&nbsp;1480 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; schedstat_inc(this_sd, ttwu_move_affine);</span></div><div class="" id="zw-129c6064585zdHzlM2dbd2a"><span id="zw-129c6064585HZIpZ2dbd2a">&nbsp;&nbsp;1481 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto 
out_set_cpu;</span></div><div class="" id="zw-129c6064585M_NuVW2dbd2a"><span id="zw-129c6064585PxHwu2dbd2a">&nbsp;&nbsp;1482 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }</span></div><div class="" id="zw-129c6064585Oh1Thm2dbd2a"><span id="zw-129c6064585lXBXog2dbd2a">&nbsp;&nbsp;1483 &nbsp; &nbsp; &nbsp; &nbsp; }</span></div><div id="zw-129c6064586GWOuu2dbd2a"><br id="zw-129c6064f28d_VTna2dbd2a" /></div><div id="zw-129c6064f270s17NT2dbd2a">计算出“目标处理器”和“源处理器”各自的负载（<span id="zw-129c60980fegyxpTA2dbd2a">1453行和1454行）</span>，再计算“目标处理器”上的每任务平均负载&nbsp;<span id="zw-129c60a72efJWDu-G2dbd2a">tl_per_task，最后进行判断：如果“目标处理器”的负载小于“源处理器”的负载且两处理器负载相加都比 tl_per_task小的话，唤醒的进程转为“目标处理器”执行。还有一种情况就是1474行的判断，如果“目标处理器”的负载加上被唤醒的进程的负载后，还比“源处理器”的负载（乘以imbalance后）小的话，也要把唤醒的进程转为“目标处理器”执行。如果两个因素都不满足，那还是由p进程原来呆的那个CPU（即“源处理器”）继续来处理吧。</span></div><div id="zw-129c982cac6V2zSam2dbd2a"><span id="zw-129c982cac65mKOMO2dbd2a"><br /></span></div><div id="zw-129c982cc1e4Glgf32dbd2a"><span id="zw-129c982cc1e_X_8gs2dbd2a">有点儿绕，是吧？其实代码虽绕，用意是简单的：</span></div><div id="zw-129c615b0e51CxIb2dbd2a"><span id="zw-129c615b0e5bYCeYM2dbd2a"><br /></span></div><div id="zw-129c615b48dczOP6d2dbd2a"><span id="zw-129c615b48diyb0Qa2dbd2a">1472行-1473行其实是这样一个用意：如果“目标处理器”的负载很小，小得即使把压力全给到“源处理器”上去也不会超过“目标处理器”上的每任务平均负载（tl_per_task），那么这“目标处理器”的负载是真的很小，值得把p进程挪过来。</span></div><div id="zw-129c618292cESlNg2dbd2a"><span id="zw-129c618292dQxVGyt2dbd2a">1474行的用意则是：如果我们真的把p进程挪到“目标处理器”以后，“目标处理器”的压力也不比“源处理器”大多少，所以，还是值得一挪。</span></div><div id="zw-129c61958a27L8Tkw2dbd2a"><span id="zw-129c61958a2ySvxBU2dbd2a"><br /></span></div><div id="zw-129c61959f6g1oIBd2dbd2a">说来说去，还是那个笨原则：把任务从最忙的CPU那儿转到很闲的CPU这儿。</div><div id="zw-129c9d590d6ZWIHIX2dbd2a"><br id="zw-129c9d590d6f-1O892dbd2a" /></div><div id="zw-129c9d59249o1IMgS2dbd2a">我们已经看过了睡眠和醒来时的内核函数，那么软中断里的<span id="zw-129c9d5f221J0dmmL2dbd2a">run_rebalance_domains又干了些什么呢？其实也很简单，它调用了load_balance函数，而这个函数和load_balance_newidle实现上基本一样，就不赘述了。</span></div><div id="zw-129d90a407dDAAjbR2dbd2a"><span 
id="zw-129d90a407eO4H6Pj2dbd2a"><br /></span></div><div id="zw-129d90a41fdk7ZkFt2dbd2a"><span id="zw-129d90a41fd041H3h2dbd2a">这里没有探讨进程优先级和进程负载的计算方法，因为太复杂我也不太理解，以后看代码如果有心得，再与大家分享。</span></div><div id="zw-129d90a41fdk7ZkFt2dbd2a"><span id="zw-129d90a41fd041H3h2dbd2a"><br /></span></div>]]></description>
            <link>http://donghao.org/2010/07/kernel-smpiioouaoai.html</link>
            <guid>http://donghao.org/2010/07/kernel-smpiioouaoai.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">操作系统</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">kernel</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">linux</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">SMP</category>
            
            <pubDate>Tue, 20 Jul 2010 11:08:35 +0800</pubDate>
        </item>
        
        <item>
            <title>多线程并非万金油</title>
            <description><![CDATA[为了发挥多核机器的威力，可以用多进程或多线程的办法，由于多进程往往涉及共享内存等IPC问题，所以很多人都倾向于选择多线程，并以此为灵丹妙药。但多线程并非万能，它虽然使用方便，却也有硬伤——线程一死，会牵连其它。孙子曰“<span id="zw-129ca97a249CB8tW2dbd2a" style="font-family: Simsun; line-height: 20px; ">不尽知用兵之害者，则不能尽知用兵之利也”，不了解多线程的缺点，也就不能很好的使用它。</span><p></p><p id="zw-129ca82fddclFR8v2dbd2a"><span id="zw-129ca97a249CB8tW2dbd2a" style="font-family: Simsun; line-height: 20px; "></span></p><p id="zw-129ca866412a8Pd3r2dbd2a"><span class="Apple-style-span" id="zw-129ca866412xxhbi2dbd2a" style="font-family: Simsun; line-height: 20px; ">我们项目中有一个daemon，功能是转发并处理消息，为了能看到daemon运行的细节，我们还做了一个monitor线程，由该线程通过某个端口提供简单的web服务，这样就可以直接用浏览器查看daemon的运行状态（比如处理了多少消息，丢弃了多少等）。后来，monitor线程出现了一个bug，造成线程挂掉——于是造成了整个daemon挂掉。这下郁闷了，daemon本身是很重要的，而monitor是不那么重要的，现在是</span><span class="Apple-style-span" id="zw-129ca8e990azpYqes2dbd2a" style="font-family: Simsun; line-height: 20px; font-weight: bold; ">次要部分的bug拖累了重要部分的运行</span><span class="Apple-style-span" id="zw-129ca8e990ajpEyQ2dbd2a" style="font-family: Simsun; line-height: 20px; ">。</span></p><p id="zw-129ca8ea1058axCHj2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span class="Apple-style-span" id="zw-129ca8ea105dm--m32dbd2a" style="font-family: Simsun; line-height: 20px; ">这就是多线程程序的缺陷。如果多个线程做的是同样的事情，那还尚可；但如果多个线程，有的做这件事，有的做那件事，而且事情的重要程度不同，那不重要的线程由于代码错误或其他原因死了，其它的线程——包括执行重要功能的——也只能跟着挂。这在健壮性上肯定是不好的。apache采用多进程应该也是出于这样的考虑，因为它的module可能是用户自己写的，可能并不稳定，但由于module不稳定而挂掉整个apache，显然不应该。当然，apache2开始支持多线程，但即使这样，它默认还是多进程的，并没有整个倒向多线程。</span></p><p id="zw-129ca91cb70hMko782dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span class="Apple-style-span" id="zw-129ca91cb70RJpVOG2dbd2a" style="font-family: Simsun; line-height: 20px; ">也许有人会说：你代码写好一点，不要有bug，多线程不就没事了吗？首先，我们讨论的是软件健壮性的问题——怎样在坏了一部分以后其它部分还能工作，而不是软件正确性的问题——怎样写正确的代码。不是一个方向的问题，并不矛盾。其次，软件不可能没有bug，我们如果能把不同杀伤性的bug通过不同进程把它们隔开，就能降低影响，这跟挖掘bug的目标是一致的——都是为了增加软件的可用性。</span></p><p id="zw-129ca957422nBfc02dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span 
class="Apple-style-span" id="zw-129ca9574228QN1M42dbd2a" style="font-family: Simsun; line-height: 20px; ">所以，多线程并非万金油。为了健壮性，可以考虑把不同性质的任务分到不同的进程上，再由父进程统一管理。而在这些进程之下，可以再有多线程。当然，这样开发就复杂了。</span></p>]]></description>
            <link>http://donghao.org/2010/07/aissiciooi.html</link>
            <guid>http://donghao.org/2010/07/aissiciooi.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">软件开发</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">多线程</category>
            
            <pubDate>Tue, 13 Jul 2010 15:11:04 +0800</pubDate>
        </item>
        
        <item>
            <title>旧事一则</title>
            <description><![CDATA[<div>现在唐骏文凭造假的事闹得沸沸扬扬。提起唐先生，想起旧事一则。</div><div><br /></div><div>那时候我还在软件学院读研。一天早上，一个IBM的工程师来学院演讲，主要是讲开发相关的东西，讲座中提到“各位同学将来应该都是以写代码为主”云云；正好当天下午，唐骏也来学院演讲（先生还挺喜欢走穴演讲），讲的是个很虚幻的主题，我已记不大清了，但中间也提到一句“各位软件学院的同学将来是软件业的领军者”云云。当天晚上，有同学在学院论坛上发帖，说：唐骏是好样的，因为他说咱们是领军者，那个IBM的不像话，居然说我们是码农。</div><div><br /></div><div>人贵有自知之明。软件学院的学生基本都是考计算机系考不上调剂的，或知道自己考不上计算机系而改考的（比如我），这个地方毕业，能找个安安稳稳的写代码的工作就不错了，还他妈的“软件领军者”！？扯淡去吧。</div><div><br /></div><div>IBM的工程师一句实话，同学们不爱听，唐先生一句马屁，大家还挺入耳，也难怪唐骏这么火——说话这么动听这么广结善缘，捧他的人、爱听他说的人能不多吗？</div><div><br /></div><div>人们太爱听好听的了，所以现在各类兜售“成功学”的书都很火，什么《我的成功可以复制》啊，《世界因你不同》啊。如果有个老实巴交的工程师站出来告诉大家：“成功只能是老老实实学习，辛辛苦苦工作”，估计没几个人愿听。</div><div><br /></div><div>唐骏本来就是个前微软分部的CEO，一个普普通通的职业经理人，他之所以能火，不也是浮躁的人们追捧的结果吗？</div> ]]></description>
            <link>http://donghao.org/2010/07/eeaooo.html</link>
            <guid>http://donghao.org/2010/07/eeaooo.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">生活随感</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">软件学院</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">唐骏</category>
            
            <pubDate>Thu, 08 Jul 2010 10:04:43 +0800</pubDate>
        </item>
        
        <item>
            <title>Padding也要小心</title>
            <description><![CDATA[<div>为了在32位机器和64位机器之间传递状态消息，我们给消息格式做了padding：</div><div><br /></div><div>struct StateMsg</div><div>{</div><div><span class="Apple-tab-span" style="white-space:pre">	</span>uint32_t msgType;</div><div><span class="Apple-tab-span" style="white-space:pre">	</span>uint32_t padding;</div><div><span class="Apple-tab-span" style="white-space:pre">	</span>uint64_t msgID;</div><div>};</div><div><br /></div><div>这样，不管是在32位机器上还是64位机器上，消息的大小都是16个字节。开始一切正常，直到后来我们发现有问题：程序里会比较本条状态消息与上一条有什么不同，如果不一样，要清空路由表；如果一样，就说明状态没有变化，于是不做任何操作。而错误出现在我们的消息比较用的是memcmp：</div><div><br /></div><div>memcmp(oldMsg, newMsg, sizeof(struct StateMsg));</div><div><br /></div><div>这下连padding也加入比较了，但是padding我们却没有对它赋初值！结果，每条消息都和上一条不同，路由表于是被频繁的清空....</div><div>padding本身是用来对齐的，对业务没有任何意义，所以赋值的时候容易忘掉它。教训啊。</div>]]></description>
            <link>http://donghao.org/2010/07/paddingooa.html</link>
            <guid>http://donghao.org/2010/07/paddingooa.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">软件开发</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">padding</category>
            
            <pubDate>Thu, 01 Jul 2010 11:26:27 +0800</pubDate>
        </item>
        
        <item>
            <title>公用电话</title>
            <description><![CDATA[去年公司电话紧张，于是收了我的座机，让我和边上的同事（赵明华）共用一部。<div>于是我在旺旺上的签名写成：</div><div>“分机：70663（请门口赵大爷叫一声）”</div><div>我的旺旺名字（也就是花名）是“三百”。</div><div><br /></div><div>结果，朋友不久就来了个消息：<br />“你们公司三百个人用一个电话号码？！”</div>]]></description>
            <link>http://donghao.org/2010/06/oac.html</link>
            <guid>http://donghao.org/2010/06/oac.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">对话收录</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">分机</category>
            
            <pubDate>Wed, 23 Jun 2010 10:53:29 +0800</pubDate>
        </item>
        
        <item>
            <title>如厕</title>
            <description><![CDATA[<span class="Apple-style-span" style="color: rgb(0, 0, 0); font-family: Verdana; ">我到26层去查故障，顺便上个厕所。一进厕所就撞见苏宁。<div style="margin-top: 0px; margin-bottom: 0px; ">苏：你搬下来了？</div><div style="margin-top: 0px; margin-bottom: 0px; ">我：没有啊</div><div style="margin-top: 0px; margin-bottom: 0px; ">苏：那你是专程来这里上厕所的？</div><div style="margin-top: 0px; margin-bottom: 0px; ">我：是的</div><div style="margin-top: 0px; margin-bottom: 0px; ">苏：难得啊。我代表26层全体马桶欢迎你。</div><div style="margin-top: 0px; margin-bottom: 0px; ">我：嗯，你来代表，你是他们的“桶帅”嘛。马桶里面最帅的。</div></span> ]]></description>
            <link>http://donghao.org/2010/06/ec-2.html</link>
            <guid>http://donghao.org/2010/06/ec-2.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">对话收录</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">马桶</category>
            
            <pubDate>Tue, 22 Jun 2010 18:37:12 +0800</pubDate>
        </item>
        
        <item>
            <title>[kernel] 使用sendfile</title>
            <description><![CDATA[<span class="Apple-style-span" style="color: rgb(0, 0, 0); font-family: Verdana; ">linux内核里对于要发送到网络上去的包，是通过sk_buffer结构组织的，这个结构主要就是包含一个指针和一个长度，指针指向要发送数据的开头处，长度当然就是要发送的长度。当我们使用send系统调用时，内核要把用户空间的数据拷往内核空间（创建一个副本），然后构造sk_buffer指向这个内核空间里的副本，接着把sk_buffer排到队列里去，等待网卡处理。<div style="margin-top: 0px; margin-bottom: 0px; ">后来诞生了sendfile，sendfile会直接将sk_buffer指向文件在内存中的cache，这样就节省了一次拷贝。<br /><div style="margin-top: 0px; margin-bottom: 0px; ">这里有个注意事项：sendfile不适合频繁改动的文件。假如你调用了一次sendfile，sk_buffer指向文件的某块cache，然后被放入队列；接着，你改动了文件的内容，那么排在发送队列里的那个sk_buffer指向的cache内容就变化了！那发出去的消息就是你改动后的数据。在不知道sk_buffer是否已发往网络的情况下，一边改动文件一边调用sendfile会造成发送出去的内容不确定。所以，sendfile更适合用在静态文件的场合。</div><div style="margin-top: 0px; margin-bottom: 0px; ">当然，如果有个办法能查询sk_buffer是否真的发送出去了（而不是呆在发送队列里），那么“改一次文件内容，调一次sendfile；再改一次文件内容，再调一次sendfile”也是一个不错的节省拷贝时间的发送策略。</div></div></span>]]></description>
            <link>http://donghao.org/2010/06/kernel-eoasendfile.html</link>
            <guid>http://donghao.org/2010/06/kernel-eoasendfile.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">操作系统</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">sendfile</category>
            
            <pubDate>Mon, 21 Jun 2010 13:50:52 +0800</pubDate>
        </item>
        
        <item>
            <title>[kernel] 在release方法里，而不是flush方法里释放</title>
            <description><![CDATA[<span class="Apple-style-span" style="color: rgb(0, 0, 0); font-family: Simsun; font-size: medium; "><div style="margin-top: 6px; margin-right: 6px; margin-bottom: 6px; margin-left: 6px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; font-family: Verdana; font-size: 10pt; background-color: rgb(255, 255, 255); color: rgb(0, 0, 0); min-height: 1100px; counter-reset: __goog_page__ 0; line-height: normal; ">做一个文件系统（姑且叫它“蝗虫文件系统“），需要在进程退出时自动删除它打开的文件。另外此文件还支持poll（我自己做的，一般的文件系统不支持），支持poll当然需要等待队列，所以我把一个等待队列放在文件对应的struct inode里，实现该文件struct file_operations里的flush方法，在flush时删除文件并释放该等待队列（kfree）。<br /><div style="margin-top: 0px; margin-bottom: 0px; text-align: justify; ">这样做似乎一切顺利，直到应用程序开始使用epoll:先在蝗虫文件系统里创建几个文件，再把它们的fd放入epoll（epoll_ctl），然后进程退出，如此多来几次，内核就panic了。还好我用的是QEMU，可以清楚看见在什么地方panic的，结果是 __fput --&gt; eventpoll_release --&gt; eventpoll_release_file --&gt; ep_remove --&gt; ep_unregister_pollwait --&gt; remove_wait_queue，看看ep_unregister_pollwait的代码：</div><div style="margin-top: 0px; margin-bottom: 0px; text-align: justify; "><br /></div><div style="margin-top: 0px; margin-bottom: 0px; text-align: justify; ">[fs/eventpoll.c --&gt; ep_unregister_pollwait]</div><div style="margin-top: 0px; margin-bottom: 0px; text-align: justify; "><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1109 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1110 {</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1111 &nbsp; &nbsp; int nwait;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1112 &nbsp; &nbsp; struct list_head *lsthead = &amp;epi-&gt;pwqlist;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1113 &nbsp; &nbsp; struct eppoll_entry *pwq;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1114</div><div style="margin-top: 0px; margin-bottom: 0px; 
">&nbsp;&nbsp;1115 &nbsp; &nbsp; /* This is called without locks, so we need the atomic exchange */</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1116 &nbsp; &nbsp; nwait = xchg(&amp;epi-&gt;nwait, 0);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1117</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1118 &nbsp; &nbsp; if (nwait) {</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1119 &nbsp; &nbsp; &nbsp; &nbsp; while (!list_empty(lsthead)) {</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1120 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pwq = list_entry(lsthead-&gt;next, struct eppoll_entry, llink);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1121</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1122 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ep_list_del(&amp;pwq-&gt;llink);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1123 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp;<font class="Apple-style-span" color="#FF0000">remove_wait_queue(pwq-&gt;whead, &amp;pwq-&gt;wait);</font></div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1124 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; kmem_cache_free(pwq_cache, pwq);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1125 &nbsp; &nbsp; &nbsp; &nbsp; }</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1126 &nbsp; &nbsp; }</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;1127 }</div><div style="margin-top: 0px; margin-bottom: 0px; text-align: justify; "><br /></div>在把多个文件句柄加入epoll里的时候，这些文件句柄对应的inode上的等待队列要被epoll挂在一个数据结构上（struct eppoll_entry)，当进程退出，这些等待队列当然要从数据结构上拿掉，但是，这些等待队列已经被我kfree了，所以panic。看来把kfree放在flush是不行了，那放哪儿？有哪个调用是比eventpoll_release发生的更晚的？有！</div><div style="margin-top: 0px; margin-bottom: 0px; text-align: justify; "><br /></div><div style="margin-top: 0px; margin-bottom: 0px; text-align: 
justify; ">[fs/file_table.c --&gt; __fput]</div><div style="margin-top: 0px; margin-bottom: 0px; text-align: justify; "><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;153 void fastcall __fput(struct file *file)</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;154 {</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;155 &nbsp; &nbsp; struct dentry *dentry = file-&gt;f_dentry;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;156 &nbsp; &nbsp; struct vfsmount *mnt = file-&gt;f_vfsmnt;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;157 &nbsp; &nbsp; struct inode *inode = dentry-&gt;d_inode;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;158</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;159 &nbsp; &nbsp; might_sleep();</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;160</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;161 &nbsp; &nbsp; fsnotify_close(file);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;162 &nbsp; &nbsp; /*</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;163 &nbsp; &nbsp; &nbsp;* The function eventpoll_release() should be the first called</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;164 &nbsp; &nbsp; &nbsp;* in the file cleanup chain.</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;165 &nbsp; &nbsp; &nbsp;*/</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;166 &nbsp; &nbsp;&nbsp;<font class="Apple-style-span" color="#0000FF">eventpoll_release(file);</font></div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;167 &nbsp; &nbsp; locks_remove_flock(file);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;168</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;169 &nbsp; &nbsp; if (file-&gt;f_op &amp;&amp; 
file-&gt;f_op-&gt;release)</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;170 &nbsp; &nbsp; &nbsp;<font class="Apple-style-span" color="#FF0000">&nbsp;&nbsp; file-&gt;f_op-&gt;release(inode, file);</font></div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;171 &nbsp; &nbsp; security_file_free(file);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;172 &nbsp; &nbsp; if (unlikely(inode-&gt;i_cdev != NULL))</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;173 &nbsp; &nbsp; &nbsp; &nbsp; cdev_put(inode-&gt;i_cdev);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;174 &nbsp; &nbsp; fops_put(file-&gt;f_op);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;175 &nbsp; &nbsp; if (file-&gt;f_mode &amp; FMODE_WRITE)</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;176 &nbsp; &nbsp; &nbsp; &nbsp; put_write_access(inode);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;177 &nbsp; &nbsp; file_kill(file);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;178 &nbsp; &nbsp; file-&gt;f_dentry = NULL;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;179 &nbsp; &nbsp; file-&gt;f_vfsmnt = NULL;</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;180 &nbsp; &nbsp; file_free(file);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;181 &nbsp; &nbsp; dput(dentry);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;182 &nbsp; &nbsp; mntput(mnt);</div><div style="margin-top: 0px; margin-bottom: 0px; ">&nbsp;&nbsp;183 }</div><br /><div style="margin-top: 0px; margin-bottom: 0px; ">struct file_operations里的release方法就是在epoll释放之后调用的，所以应该把kfree挪到release里去执行。</div><div style="margin-top: 0px; margin-bottom: 0px; ">照此改之，没有panic了。</div></div></div></span> ]]></description>
            <link>http://donghao.org/2010/06/kernel-oureleaseaioecflushaiei.html</link>
            <guid>http://donghao.org/2010/06/kernel-oureleaseaioecflushaiei.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">操作系统</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">epoll</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">flush</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">kernel</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">release</category>
            
            <pubDate>Tue, 01 Jun 2010 17:11:56 +0800</pubDate>
        </item>
        
        <item>
            <title>[linux] tail和poll无关</title>
            <description><![CDATA[不知道是我听错了还是别人说错了，我一直以为 tail -f 是通过poll实现的，而每个ext2（或ext3）文件都支持poll。当文件有新数据时，针对此文件的poll会返回结果。&nbsp;<br /><br />昨天想参考kernel是怎么让设备实现poll的，看了ext2文件系统，发现它没提供poll接口。于是自己写了个程序来试试，对着一个ext2文件epoll，结果，即使文件里的数据增加了，epoll_wait也没有任何反应。就socket支持poll，硬盘文件不支持的。那tail -f是怎么实现的？下了<a href="http://www.gnu.org/software/coreutils/" id="vbp:" title="coreutils" style="color: rgb(85, 26, 139); ">coreutils</a>一看，喔，其实是每1秒检查一遍文件，看大小有否变化，如果有，则异步的read该文件，取得新数据，打印到终端......<div>想想也对，tail命令出现的很早，那时候说不定还没有poll呢，它就用这个简单办法不也解决了问题么。</div>]]></description>
            <link>http://donghao.org/2010/06/linux-tailipollio.html</link>
            <guid>http://donghao.org/2010/06/linux-tailipollio.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">软件开发</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">linux</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">poll</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">tail</category>
            
            <pubDate>Tue, 01 Jun 2010 16:47:29 +0800</pubDate>
        </item>
        
        <item>
            <title>没什么可后悔的</title>
            <description><![CDATA[<div>看到<a href="http://www.alibuybuy.com/19769.html">奇文一篇</a>，说世界上有75%的人都后悔自己年轻时努力不够。</div><div>我不理解。</div><div><br /></div><div>读中学的时候，我不知道花了多少精力来背单词音标（因为我考音标总是丢分）、背历史、背古文，还有准备各种各样的考试，我自己回想起来真是没什么遗憾，真的够卖力了。但我现在用得上英语音标吗？用得上历史年代吗？用得上古文吗？我把我的少年时代拿来背书了。</div><div>上大学的时候，我花了很大力气才考过了有机化学、物理化学、分析化学、结构化学，整个大学我都在对挂科的恐惧中度过的，多少个自习室熄灯的晚上我还在看那该死的无用的化学书啊。后来我当上了程序员，我学那么苦那么多那么卖力的化学知识，拿来干什么呢？我四年的青春就拿来送给化学系了。</div><div><br /></div><div>少壮不努力，老大徒伤悲；那少壮努力了，就不伤悲了吗？努力错方向的人、努力白费了的人，他们就不后悔了吗？</div><div><br /></div><div>“努力”只是一种投资，跟其他投资形式一样，都是有风险的，你能担保你的“努力”就一定有回报吗？</div><div><br /></div><div>说回那篇奇文，如果你年轻时努力透了，你可能现在每天睡前，发现自己浪费了那么多青春时光在死记硬背上，回首往事，居然除了课本，啥也记不起来，你是不是还是后悔呢？</div><div>没努力过，后悔；努力过，也后悔，人这种动物，真是太麻烦了，上帝如果天天听人类说后悔事，估计再来一次世界末日的心思都有了。</div><div><br /></div><div>所以，别再后悔了，别再唠叨了，过去的就一定要过去。现在不是过得挺好吗？好好过吧。</div>]]></description>
            <link>http://donghao.org/2010/05/aeaeoua.html</link>
            <guid>http://donghao.org/2010/05/aeaeoua.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">生活随感</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">后悔</category>
            
            <pubDate>Wed, 26 May 2010 09:46:47 +0800</pubDate>
        </item>
        
        <item>
            <title>[kernel] epoll里的EPOLLET标记</title>
            <description><![CDATA[补充了一篇epoll里EPOLLET标记的代码剖析：<div><br /></div><div>&nbsp;&nbsp; &nbsp; &nbsp;&nbsp;<a href="http://donghao.org/docs/linux_kernel_poll_epoll_3.pdf" style="text-decoration: underline; ">《poll和epoll内核源码剖析》（三）</a></div><div><br /></div>]]></description>
            <link>http://donghao.org/2010/05/kernel-epollaiaepolletec.html</link>
            <guid>http://donghao.org/2010/05/kernel-epollaiaepolletec.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">操作系统</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">epoll</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">kernel</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">linux</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">poll</category>
            
            <pubDate>Fri, 14 May 2010 15:35:49 +0800</pubDate>
        </item>
        
        <item>
            <title>[tcp] 异步connect是否成功？</title>
            <description><![CDATA[<span class="Apple-style-span" style="color: rgb(0, 0, 0); font-family: Verdana; ">我原先的client端代码流程如下：&nbsp;&nbsp;<br /><br /><div style="margin-top: 0px; margin-bottom: 0px; "><div style="margin-top: 0px; margin-bottom: 0px; ">创建一个socket</div><div style="margin-top: 0px; margin-bottom: 0px; ">设为异步socket（fcntl）</div><div style="margin-top: 0px; margin-bottom: 0px; ">将socket加入epoll</div><div style="margin-top: 0px; margin-bottom: 0px; ">connect到远端（此时connect调用返回非0，但errno为EINPROGRESS，表示正在建立连接中）</div>epoll_wait之</div><div style="margin-top: 0px; margin-bottom: 0px; ">捕获到EPOLLOUT事件，此时便认为connect已经成功，client端开始发消息</div><br /><div style="margin-top: 0px; margin-bottom: 0px; ">这个过程通常能够运转，但是线上环境复杂多变，如果发生这种情况：server进程调用listen开始侦听后，被gdb或信号挂住了，此时异步connect会怎样？很遗憾，client端的epoll_wait依然返回EPOLLOUT，甚至往此socket里发消息都返回成功，只有当发的消息多得占完了server端的tcp缓冲以后（窗口收缩到很小），send调用才开始失败。这时候用 lsof -i 看网络连接也很有趣，client端的机器显示连接建立了，server端的却显示没有这个连接。</div>仔细想想，OS这样做是正确的，毕竟connect的语义只是“连接”，当server挂住时，连接还是能成功的，但你能不能往里面发消息那就是另外一回事了。<br /><br /><div style="margin-top: 0px; margin-bottom: 0px; ">所以对于应用来说，异步socket想要知道connect后连接是不是可以正常收发数据了，还是要靠应用层的一问一答才能知道。</div><div style="margin-top: 0px; margin-bottom: 0px; "><br /></div><div style="margin-top: 0px; margin-bottom: 0px; "><br /></div><div style="margin-top: 0px; margin-bottom: 0px; ">====== 2010.5.14 ======</div><div style="margin-top: 0px; margin-bottom: 0px; "><br /></div><div style="margin-top: 0px; margin-bottom: 0px; ">昨天同事<span class="Apple-style-span" style="font-family: &#65533;&#65533;&#65533;&#65533;; font-size: 12px; line-height: 18px; -webkit-border-horizontal-spacing: 1px; -webkit-border-vertical-spacing: 1px; ">朱照远给了一个更正确的解决方案，可参考之：</span></div><div style="margin-top: 0px; margin-bottom: 0px; "><font class="Apple-style-span" face="&#65533;&#65533;&#65533;&#65533;" size="3"><span class="Apple-style-span" style="font-size: 12px; line-height: 18px; -webkit-border-horizontal-spacing: 1px; -webkit-border-vertical-spacing: 
1px;">“<span class="Apple-style-span" style="font-family: 'Lucida Grande', Verdana, Arial, 'Bitstream Vera Sans', sans-serif; line-height: 16px; -webkit-border-horizontal-spacing: 0px; -webkit-border-vertical-spacing: 0px; color: rgb(51, 51, 51); ">收到EPOLLOUT也不能认为是TCP层次上connect(2)已经成功，要调用getsockopt看SOL_SOCKET的SO_ERROR是否为0。若为0，才表明真正的TCP层次上connect成功。至于应用层次的server是否收/发数据，那是另一回事了。”</span></span></font></div></span>]]></description>
            <link>http://donghao.org/2010/05/tcp-oiconnectecne.html</link>
            <guid>http://donghao.org/2010/05/tcp-oiconnectecne.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">软件开发</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">connect</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">tcp</category>
            
            <pubDate>Tue, 11 May 2010 15:28:12 +0800</pubDate>
        </item>
        
        <item>
            <title>大项目不等于大trunk</title>
            <description><![CDATA[<span class="Apple-style-span" style="color: rgb(0, 0, 0); font-family: Simsun; font-size: medium; "><div style="border-top-width: 0px; border-right-width: 0px; border-bottom-width: 0px; border-left-width: 0px; border-style: initial; border-color: initial; font-family: verdana; font-size: 10pt; direction: ltr; background-color: rgb(255, 255, 255); line-height: 1.2; margin-top: 4%; margin-right: 0.1%; margin-bottom: 4%; margin-left: 1.1%; "><p id="zw-1286c815e79ddtiFJ2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><span class="Apple-style-span" style="font-family: Simsun; line-height: normal; font-size: medium; "></span></p><div style="border-top-width: 0px; border-right-width: 0px; border-bottom-width: 0px; border-left-width: 0px; border-style: initial; border-color: initial; font-family: verdana; font-size: 10pt; direction: ltr; background-color: rgb(255, 255, 255); line-height: 1.2; margin-top: 4%; margin-right: 0.1%; margin-bottom: 4%; margin-left: 1.1%; "><p id="zw-1286c815e79ddtiFJ2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">最近参与一个大项目，几十个开发人员，代码都放在一个trunk下，也就是说，这几十个人的代码都在一起编译，一起跑单元测试，一起在半夜跑自动化测试用例。反正我以前是从未加入过这么大的trunk（之前我参加的项目最多就四五个人在一起写代码而已），现在发现巨大的trunk造成了一些很郁闷的问题。</p><p id="zw-1286c844161I7jonO2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">首先，编译越来越慢。项目用<a href="http://www.scons.org/" id="zw-1286ca8bc482Fz-Qr2dbd2a" target="_blank" title="scons">scons</a>来编译（我暂时还没发现scons比Makefile好在哪里），一编译就耗干系统的内存（我们的系统是8G内存），整个trunk重编译一次是两个多小时，好在我负责的只是其中一部分，但是即使是这一部分，也要花十多分钟编译（我目前还不能说这是scons慢造成的，但是python占内存偏多，嫌疑很大），这在我调试阶段很费时间。</p><p id="zw-1286ca97ef4U-Tj8j2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">其次，互相干扰很严重。几十个程序员，就算一个人每一个月才出现一次代码错误，那加起来就是：每天都有人至少犯一次代码错误。虽然使用了<a href="http://www.reviewboard.org/" id="zw-1286ca97ef4AJNwre2dbd2a" target="_blank" 
title="ReviewBoard">ReviewBoard</a>，但是review不可能洞察一切，还是会有单元测试频繁的不过。后来加了一个提交代码的工具，为每个提交代码的人单独编译并跑单元测试，通过了才真正交给subversion，但这样还是有问题：每次提交，我都发现有很多人排着队的等前面的人编译完，我有几次好不容易排上了，跟别人的提交一起编译，结果由于别人的提交编译不过，我还得重新排队......问题的根源在于：这个trunk上的提交太过频繁了。毕竟几十个人呢，几乎每十分钟就有新的code需要提交。</p><p id="zw-1286c9327e7UMT692dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">可能长期在互联网公司干，我对那种动辄几百万行code，几十兆可执行文件的大软件越来越反感。我觉得好的软件，一定是满足且刚好满足了某一类用户的某一个需要，而不是妄图去满足所有用户的很多需要。说白了：尽量做小巧的东西。如果项目真的是个大项目，那最好是能拆成多个小软件，好歹代码能分到不同的trunk下去，trunk小了，挤在一个trunk下的开发人员就会少很多，就不会有漫长的编译和频繁的提交了，然后，这些小软件一起运行，完成某个大工作。</p><p id="zw-1286cab0719YdP4s2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">说得有点理想化了，把大项目拆成小部分，这个道理谁都懂，但难度在于怎么拆？这种问题我回答不了，也许<a href="http://book.douban.com/subject/1467587/" id="zw-1286cab071a2x_i2N2dbd2a" target="_blank" title="《unix编程艺术》">《unix编程艺术》</a>能够回答：不要写一个巨大的进程，应该写多个，然后一起工作。进程都可以拆开，代码有什么不能拆开的？有人可能担心效率，但是相比于混乱的管理，这点效率损失我觉得还算值得。</p><p id="zw-1286cac12c3zuCWsQ2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">以上是我的看法。实际中，反例也是有的，比如windows NT就是一堆人挤在一个代码trunk下做出来的，而且产品还很成功。但是从<a href="http://book.douban.com/subject/3699395/" id="zw-1286cac12c3EdXWe2dbd2a" target="_blank" title="《观止》">《观止》</a>里可以看出，NT的开发人员也一样被频繁的build break和单元测试fail折磨得人不人鬼不鬼，也许这就是为什么他们要辛苦上6年才能开发完成，以及为什么操作系统的未来是属于微内核的（微内核巧妙的把OS拆开了）。</p><p id="zw-1286cac12c3zuCWsQ2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; "><br /></p><p id="zw-1286cac12c3zuCWsQ2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">====== 2010.5.10 ======</p><p id="zw-1286cac12c3zuCWsQ2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">补充一下，每次编译，除了内存，CPU也被占满，登录同一台服务器的其他人连vi都用不了，到了后来，我在哪台机上编译，哪台机上的开发人员就埋怨，真成“嫌人”了；编译完后，20G的硬盘空间就没了，想只编译一部分是不行的，因为没拆开，整个trunk必须一起编译。</p><p id="zw-1286cac12c3zuCWsQ2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt; ">更可怕的是，这么大的trunk，还要打分支，一个160G的硬盘，才能下载编译几个分支呢？</p></div><p></p></div></span> ]]></description>
            <link>http://donghao.org/2010/05/oiiaeouotrunk.html</link>
            <guid>http://donghao.org/2010/05/oiiaeouotrunk.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">软件开发</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">software</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">unix</category>
            
            <pubDate>Thu, 06 May 2010 16:18:56 +0800</pubDate>
        </item>
        
        <item>
            <title>超时问题调研</title>
            <description><![CDATA[<p id="zw-12844248aa1R1WlAy2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">我们的消息中间件，要在每个通信的机器上启一个daemon，负责转发。我们还为这个中间件开发了php接口（尽管我并不看好php，但它毕竟是前端的常用语言）。</p><p id="zw-12844268a6aeWRDn32dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">在
处理业务部门反馈的问题时，发现一个奇特的现象：php程序通过中间件收发消息，设置是3ms超时，运行正常，没有超时的消息；但如果在php里加一个循
环（加循环的位置与中间件的调用无关），超时出现了。php自己傻循环怎么会影响中间件消息的response time呢？</p><p id="zw-12847ac299dwPkKOt2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">先是怀疑我们的php module写得不对，参考了一下别人的代码，发现没有什么特别的，用的是极简单的php module API，如果这样写都不对，真不知道什么是对的了。</p><p id="zw-12844288100s7UzN62dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">再
是怀疑php本身，于是换了几个php版本（php接口也要重新编译，累人），问题依然出现。再把我们的中间件API加入php的
basic_function里，即做为php的内置函数（就像split,sleep这些函数一样），问题还是一样。可能不是php的问题。</p><p id="zw-128442afffe2uqCUV2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">接着怀疑apache，于是降低apache的进程数，发现httpd进程数为1或2的时候真的没有超时了，但进程数如果多于3就不行。难道apache的调度有问题？我们默认用的是prefork任务模式，换成worker试试（参见<a id="zw-128484170b1FmHyWU2dbd2a" href="http://httpd.apache.org/docs/2.0/mod/worker.html">这里</a>），换mpm了，php给apache的so也要重新编译（还是累人），最后发现换成worker任务内模式当进程数多了还是一样超时。</p><p id="zw-128442c897dyWqcGw2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">难道和php、apache都没有关系？干脆启动两个独立的php进程，没有超时；启动3个，没有超时；启动4个，超时来了。看来4这个数字很神秘。最后猜测是机器乃4核CPU，4个php进程占完了4个CPU，中间件daemon就没有保障了。</p><p id="zw-1284841e90dixOnVl2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">这个猜测有点跳跃，要想办法证明之，于是想到了<a id="zw-1284842f5eeuYMSy2dbd2a" href="http://www.ibm.com/developerworks/cn/linux/l-affinity.html">CPU affinity</a>，我们用<a id="zw-1284841e90dnFokN2dbd2a" href="http://www.hiadmin.com/?p=1452">taskset</a>把4个php进程都绑定到0,1,2三个CPU上，此时超时消失了。看来之前确实是php进程占了4个CPU，导致中间件daemon无法足够快的分配到CPU。</p><p id="zw-12847af1464BOT6OW2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">解决方案呢？</p><p id="zw-12847b24047p_c11f2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">如果我改改daemon，是不是能解决这个问题？于是把daemon的优先级设为最高（RT），超时依旧。</p><p id="zw-128483dd2d4skqUe22dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">仔
细想想有了个结论：daemon用的CPU并不多，但是其睡眠和唤醒非常频繁（daemon主要是IO操作，转发来自网络的包），当其从睡眠中醒来时，必
须很快分配给它一个CPU，但是php占了4个CPU，所以不能很快的给daemon，所以daemon出现了延时（睡过头了），造成消息超时。这大概能
说明为什么把daemon设成高优先级不能解决问题——优先级虽高，但终究是要睡的。</p><p id="zw-12847b3413fyDZSqH2dbd2a" style="margin-bottom: 12pt; margin-top: 0pt;">我的结论就是这样，最后的证明还是要看看内核代码了。未完待续。<br id="zw-12847b3413fGZo5zS2dbd2a" /></p>]]></description>
            <link>http://donghao.org/2010/04/eieian.html</link>
            <guid>http://donghao.org/2010/04/eieian.html</guid>
            
                <category domain="http://www.sixapart.com/ns/types#category">软件开发</category>
            
            
                <category domain="http://www.sixapart.com/ns/types#tag">affinity</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">apache</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">linux</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">php</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">smp</category>
            
                <category domain="http://www.sixapart.com/ns/types#tag">taskset</category>
            
            <pubDate>Thu, 29 Apr 2010 14:26:42 +0800</pubDate>
        </item>
        
    </channel>
</rss>
