The section "Concurrency in the Kernel" in Chapter 2 ("A Peek Inside the Kernel") of Essential Linux Device Drivers describes four cases for using spin locks. It is very well written and worth a look.
My understanding is this: the whole point of a spin lock is to avoid a context switch. When do you want to avoid one? When the time spent manipulating the shared data is very short. In that case there is no need to put the process to sleep; instead, you busy-wait on the spin lock until it is released. As long as the time spent spinning is less than the cost of two context switches (swapping the process out and swapping it back in), you come out ahead. On a uniprocessor, though, the moment you occupy the processor and ask for a lock that is already held by some other context (a process or an interrupt), you have no choice but to context-switch, so that the other context can finish its work and release the lock; only then do you have any chance of acquiring it. This is why the uniprocessor implementations of the spin lock variants below do not actually spin at all: on a uniprocessor, a spin lock is implemented simply as "disable interrupts & disable preemption", which avoids the context switch that a semaphore would incur.
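For instance, the textbook pattern looks like this (a minimal sketch; my_lock, shared_count, and bump() are hypothetical names invented for this example):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);   /* hypothetical lock */
static int shared_count;           /* hypothetical shared data */

static void bump(void)
{
        spin_lock(&my_lock);       /* critical section is only a few
                                    * instructions long, so spinning
                                    * (on SMP) is cheaper than sleeping */
        shared_count++;
        spin_unlock(&my_lock);
}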
Below, let's lay out the source code of these three functions:
spin_lock() spin_lock_irq() spin_lock_irqsave()
On my 2.6.29 kernel, the uni-processor implementation of spin locks looks like this (include/linux/spinlock_api_up.h):
#ifndef __LINUX_SPINLOCK_API_UP_H
#define __LINUX_SPINLOCK_API_UP_H
#ifndef __LINUX_SPINLOCK_H
# error "please don't include this file directly"
#endif
/*
* include/linux/spinlock_api_up.h
*
* spinlock API implementation on UP-nondebug (inlined implementation)
*
* portions Copyright 2005, Red Hat, Inc., Ingo Molnar
* Released under the General Public License (GPL).
*/
#define in_lock_functions(ADDR) 0
#define assert_spin_locked(lock) do { (void)(lock); } while (0)
/*
* In the UP-nondebug case there's no real locking going on, so the
* only thing we have to do is to keep the preempt counts and irq
* flags straight, to suppress compiler warnings of unused lock
* variables, and to add the proper checker annotations:
*/
#define __LOCK(lock) \
do { preempt_disable(); __acquire(lock); (void)(lock); } while (0)
#define __LOCK_BH(lock) \
do { local_bh_disable(); __LOCK(lock); } while (0)
#define __LOCK_IRQ(lock) \
do { local_irq_disable(); __LOCK(lock); } while (0)
#define __LOCK_IRQSAVE(lock, flags) \
do { local_irq_save(flags); __LOCK(lock); } while (0)
#define __UNLOCK(lock) \
do { preempt_enable(); __release(lock); (void)(lock); } while (0)
#define __UNLOCK_BH(lock) \
do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0)
#define __UNLOCK_IRQ(lock) \
do { local_irq_enable(); __UNLOCK(lock); } while (0)
#define __UNLOCK_IRQRESTORE(lock, flags) \
do { local_irq_restore(flags); __UNLOCK(lock); } while (0)
#define _spin_lock(lock) __LOCK(lock)
#define _spin_lock_nested(lock, subclass) __LOCK(lock)
#define _read_lock(lock) __LOCK(lock)
#define _write_lock(lock) __LOCK(lock)
#define _spin_lock_bh(lock) __LOCK_BH(lock)
#define _read_lock_bh(lock) __LOCK_BH(lock)
#define _write_lock_bh(lock) __LOCK_BH(lock)
#define _spin_lock_irq(lock) __LOCK_IRQ(lock)
#define _read_lock_irq(lock) __LOCK_IRQ(lock)
#define _write_lock_irq(lock) __LOCK_IRQ(lock)
#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define _read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define _write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define _spin_trylock(lock) ({ __LOCK(lock); 1; })
#define _read_trylock(lock) ({ __LOCK(lock); 1; })
#define _write_trylock(lock) ({ __LOCK(lock); 1; })
#define _spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; })
#define _spin_unlock(lock) __UNLOCK(lock)
#define _read_unlock(lock) __UNLOCK(lock)
#define _write_unlock(lock) __UNLOCK(lock)
#define _spin_unlock_bh(lock) __UNLOCK_BH(lock)
#define _write_unlock_bh(lock) __UNLOCK_BH(lock)
#define _read_unlock_bh(lock) __UNLOCK_BH(lock)
#define _spin_unlock_irq(lock) __UNLOCK_IRQ(lock)
#define _read_unlock_irq(lock) __UNLOCK_IRQ(lock)
#define _write_unlock_irq(lock) __UNLOCK_IRQ(lock)
#define _spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags)
#define _read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags)
#define _write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags)
#endif /* __LINUX_SPINLOCK_API_UP_H */
In particular:
#define _spin_lock(lock) __LOCK(lock)
#define __LOCK(lock) \
do { preempt_disable(); __acquire(lock); (void)(lock); } while (0)
So on a uni-processor, the implementation of spin_lock() does nothing more than disable preemption. Next, look at the implementation of preempt_disable() in include/linux/preempt.h:
#ifdef CONFIG_PREEMPT
#define preempt_disable() \
do { \
inc_preempt_count(); \
barrier(); \
} while (0)
#else
#define preempt_disable() do { } while (0)
#endif
/* helper macros, from the same file: */
#define inc_preempt_count() add_preempt_count(1)
# define add_preempt_count(val) do { preempt_count() += (val); } while (0)
#define preempt_count() (current_thread_info()->preempt_count)
All of the above live in the same file; only the parts we need are excerpted here.
So under CONFIG_PREEMPT, preempt_disable() merely increments preempt_count; on a non-preemptible kernel it does nothing at all. Therefore, if you know the lock is only ever taken in process context, the plain, most traditional spin_lock() is enough. Even on a preemptible kernel, if a timer interrupt arrives and the time slice has run out, the task still will not be switched out while the lock is held, because it has been made non-preemptible.
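One consequence of preempt_count being a counter rather than a boolean is that these critical sections nest: preemption is truly re-enabled only when the count drops back to zero. A tiny sketch (lock_a and lock_b are hypothetical):

spin_lock(&lock_a);    /* preempt_count: 0 -> 1, preemption off      */
spin_lock(&lock_b);    /* preempt_count: 1 -> 2, still off           */
/* ... critical section ... */
spin_unlock(&lock_b);  /* preempt_count: 2 -> 1, STILL off           */
spin_unlock(&lock_a);  /* preempt_count: 1 -> 0, preemption possible */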
Next, let's look at the other important spin_lock variants:
spin_lock_irq
#define _spin_lock_irq(lock) __LOCK_IRQ(lock)
#define __LOCK_IRQ(lock) \
do { local_irq_disable(); __LOCK(lock); } while (0)
include/linux/irqflags.h:
#define local_irq_disable() \
do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
arch/x86/include/asm/irqflags.h:
static inline void raw_local_irq_disable(void)
{
native_irq_disable();
}
static inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
}
So on x86, spin_lock_irq() ultimately boils down to executing cli to disable local interrupts, then disabling preemption. The puzzling part is why preemption still has to be disabled when interrupts are already off: can you be preempted without an interrupt coming in? Without a timer interrupt, is preemption even possible? After thinking it over, one possibility is this: even with external interrupts masked, internal traps such as page faults can still occur. Suppose a page fault is triggered by a write to a read-only page; the kernel then performs copy-on-write to hand the process a fresh page. If memory happens to be tight at that moment, the process is put to sleep, and that is how a switch away from the lock holder could sneak in. This is my personal understanding; corrections are welcome.
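Here is a minimal sketch of the case spin_lock_irq() is designed for (dev_lock, dev_pending, my_isr, and consume_pending are hypothetical names): the lock is shared with an interrupt handler, so process context must keep local interrupts off while holding it.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(dev_lock);   /* hypothetical lock */
static int dev_pending;             /* hypothetical shared data */

/* Interrupt context: local interrupts are already off here,
 * so the plain spin_lock is sufficient. */
static irqreturn_t my_isr(int irq, void *dev_id)
{
        spin_lock(&dev_lock);
        dev_pending++;
        spin_unlock(&dev_lock);
        return IRQ_HANDLED;
}

/* Process context: must disable local interrupts while holding
 * the lock; otherwise my_isr could run in the middle of the
 * critical section (and on SMP would deadlock spinning on it). */
static void consume_pending(void)
{
        spin_lock_irq(&dev_lock);     /* cli + preempt_disable() on UP x86 */
        dev_pending = 0;
        spin_unlock_irq(&dev_lock);   /* sti + preempt_enable() */
}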
spin_lock_irqsave
#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define __LOCK_IRQSAVE(lock, flags) \
do { local_irq_save(flags); __LOCK(lock); } while (0)
include/linux/irqflags.h
#define local_irq_save(flags) \
do { \
typecheck(unsigned long, flags); \
raw_local_irq_save(flags); \
trace_hardirqs_off(); \
} while (0)
arch/x86/include/asm/irqflags.h
#define raw_local_irq_save(flags) \
do { (flags) = __raw_local_irq_save(); } while (0)
static inline unsigned long __raw_local_irq_save(void)
{
unsigned long flags = __raw_local_save_flags();
raw_local_irq_disable();
return flags;
}
static inline unsigned long __raw_local_save_flags(void)
{
return native_save_fl();
}
static inline unsigned long native_save_fl(void)
{
unsigned long flags;
asm volatile("# __raw_save_flags\n\t"
"pushf ; pop %0"
: "=g" (flags)
: /* no input */
: "memory");
return flags;
}
The difference between spin_lock_irqsave and spin_lock_irq: the former additionally saves the current interrupt flags.
Generally speaking, whenever a spin lock is shared between interrupt and process context and between process contexts at the same time, on a preemptible kernel, spin_lock_irqsave() is the one to use. If the lock is shared only between process and interrupt context, or the situation is as above but the kernel is non-preemptible, then local_irq_save() alone is enough.
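To illustrate why the saved flags matter (a minimal sketch; obj_lock and update_obj are hypothetical names): a function that may be called both with and without interrupts already disabled must not blindly re-enable them on unlock, which is exactly what spin_unlock_irq() would do.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(obj_lock);   /* hypothetical lock */

/* May be called from paths that already run with interrupts off. */
static void update_obj(void)
{
        unsigned long flags;

        spin_lock_irqsave(&obj_lock, flags);      /* saves IF, then cli */
        /* ... touch the shared object ... */
        spin_unlock_irqrestore(&obj_lock, flags); /* restores the saved IF
                                                   * instead of forcing sti */
}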
As for spin_lock_bh(), it disables softirqs (bottom halves); I have not studied its usage in detail.
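For completeness, a hedged sketch of what appears to be the usual pattern (bh_lock, rx_bytes, and the tasklet function are made-up names): spin_lock_bh() is typically used when process context shares data with a softirq or tasklet.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(bh_lock);   /* hypothetical lock */
static unsigned long rx_bytes;     /* hypothetical data shared with a tasklet */

/* Tasklet (softirq context): a plain spin_lock is enough here,
 * since another bottom half cannot preempt us on this CPU. */
static void rx_tasklet_fn(unsigned long data)
{
        spin_lock(&bh_lock);
        rx_bytes += data;
        spin_unlock(&bh_lock);
}

/* Process context: disable bottom halves so the tasklet cannot
 * run on this CPU while the lock is held. */
static void reset_stats(void)
{
        spin_lock_bh(&bh_lock);
        rx_bytes = 0;
        spin_unlock_bh(&bh_lock);
}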
WORK QUEUE
A work queue is a mechanism for deferring work. By default it uses the built-in worker threads (events/0, events/1, ..., one per CPU).
Usage is as follows (a complete example is sketched after the list):
1. If you think the default worker threads cannot keep up with your workload, you can create a dedicated worker thread:
create_workqueue(const char *name): creates one worker thread per processor.
create_singlethread_workqueue(const char *name): creates just a single worker thread.
Note that name here becomes the thread's name, which you can later see with ps aux once the code is running.
2. Next, declare the work; this can be done in two ways:
compile time: DECLARE_WORK(name, void (*function)(void *), void *data)
run time:
INIT_WORK(struct work_struct *work, void (*function)(void *), void *data)
PREPARE_WORK(struct work_struct *work, void (*function)(void *), void *data)
According to the book (Linux Device Drivers, 3rd edition), PREPARE_WORK does not initialize the work_struct. (Note these are the LDD3-era prototypes; since 2.6.20 the data pointer is gone and the work function takes a struct work_struct * instead, which is what a 2.6.29 kernel actually provides.)
3. Then queue the work onto a worker thread so it gets executed.
There are two options. The first queues it onto the default events worker threads:
schedule_work(), schedule_delayed_work()
The second queues it onto the worker thread you created yourself:
queue_work(struct workqueue_struct *queue, struct work_struct *work)
queue_delayed_work(struct workqueue_struct *queue, struct work_struct *work, unsigned long delay)
The first parameter, the workqueue_struct, is the worker thread you created yourself.
4. If you need to make sure the work you queued has actually completed, use
flush_workqueue(struct workqueue_struct *queue)
which blocks until every work item in the queue has finished (note it does not make the work run any sooner; for the default queue there is flush_scheduled_work()).
To cancel delayed work you have scheduled before it runs: cancel_delayed_work(struct delayed_work *work) (struct work_struct * in the LDD3-era API).
And if you want to tear down your kernel thread altogether: destroy_workqueue(struct workqueue_struct *queue).
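Putting the four steps together, here is a minimal sketch using the post-2.6.20 API that a 2.6.29 kernel provides (my_wq, struct my_work, my_work_fn, and submit_job are all hypothetical names invented for this example):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;   /* dedicated worker thread */

struct my_work {
        struct work_struct work;
        int payload;          /* post-2.6.20: data rides inside the struct */
};

static void my_work_fn(struct work_struct *work)
{
        struct my_work *mw = container_of(work, struct my_work, work);
        /* ... deferred processing using mw->payload ... */
        kfree(mw);
}

static int submit_job(int payload)
{
        struct my_work *mw = kmalloc(sizeof(*mw), GFP_KERNEL);

        if (!mw)
                return -ENOMEM;
        INIT_WORK(&mw->work, my_work_fn);
        mw->payload = payload;
        queue_work(my_wq, &mw->work);  /* or schedule_work() for events/N */
        return 0;
}

/* module init: my_wq = create_singlethread_workqueue("my_wq");
 * module exit: flush_workqueue(my_wq); destroy_workqueue(my_wq); */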