The section "Concurrency in the Kernel" in Chapter 2 ("A Peek Inside the Kernel") of Essential Linux Device Drivers describes four cases for using spin locks. It is very well written and worth a look.
My understanding is this: the whole point of a spin lock is to avoid a context switch. When do you want to avoid one? When the time spent manipulating the shared data is very short. In that case there is no need to put the process to sleep; instead, you busy-wait on the spin lock until it is released. As long as the time spent spinning is less than the cost of two context switches (swapping the process out and swapping it back in), you come out ahead. On a uniprocessor, though, the moment you occupy the processor and ask for a lock that is already held by some other context (a process or an interrupt), you have no choice but to context-switch, so that the other context can finish its work and release the lock; only then do you have any chance of acquiring it. This is why the uniprocessor implementations of the spin lock variants below do not actually spin at all: on a uniprocessor, a spin lock is implemented simply as "disable interrupts & disable preemption", which avoids the context switch that a semaphore would incur.
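For instance, the textbook pattern looks like this (a minimal sketch; my_lock, shared_count, and bump() are hypothetical names invented for this example):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);   /* hypothetical lock */
static int shared_count;           /* hypothetical shared data */

static void bump(void)
{
        spin_lock(&my_lock);       /* critical section is only a few
                                    * instructions long, so spinning
                                    * (on SMP) is cheaper than sleeping */
        shared_count++;
        spin_unlock(&my_lock);
}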
Below, let's lay out the source code of these three functions:
spin_lock() spin_lock_irq() spin_lock_irqsave()
On my 2.6.29 kernel, the uni-processor implementation of spin locks looks like this (include/linux/spinlock_api_up.h):
#ifndef __LINUX_SPINLOCK_API_UP_H
#define __LINUX_SPINLOCK_API_UP_H
#ifndef __LINUX_SPINLOCK_H
# error "please don't include this file directly"
#endif
/*
* include/linux/spinlock_api_up.h
*
* spinlock API implementation on UP-nondebug (inlined implementation)
*
* portions Copyright 2005, Red Hat, Inc., Ingo Molnar
* Released under the General Public License (GPL).
*/
#define in_lock_functions(ADDR) 0
#define assert_spin_locked(lock) do { (void)(lock); } while (0)
/*
* In the UP-nondebug case there's no real locking going on, so the
* only thing we have to do is to keep the preempt counts and irq
* flags straight, to suppress compiler warnings of unused lock
* variables, and to add the proper checker annotations:
*/
#define __LOCK(lock) \
do { preempt_disable(); __acquire(lock); (void)(lock); } while (0)
#define __LOCK_BH(lock) \
do { local_bh_disable(); __LOCK(lock); } while (0)
#define __LOCK_IRQ(lock) \
do { local_irq_disable(); __LOCK(lock); } while (0)
#define __LOCK_IRQSAVE(lock, flags) \
do { local_irq_save(flags); __LOCK(lock); } while (0)
#define __UNLOCK(lock) \
do { preempt_enable(); __release(lock); (void)(lock); } while (0)
#define __UNLOCK_BH(lock) \
do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0)
#define __UNLOCK_IRQ(lock) \
do { local_irq_enable(); __UNLOCK(lock); } while (0)
#define __UNLOCK_IRQRESTORE(lock, flags) \
do { local_irq_restore(flags); __UNLOCK(lock); } while (0)
#define _spin_lock(lock) __LOCK(lock)
#define _spin_lock_nested(lock, subclass) __LOCK(lock)
#define _read_lock(lock) __LOCK(lock)
#define _write_lock(lock) __LOCK(lock)
#define _spin_lock_bh(lock) __LOCK_BH(lock)
#define _read_lock_bh(lock) __LOCK_BH(lock)
#define _write_lock_bh(lock) __LOCK_BH(lock)
#define _spin_lock_irq(lock) __LOCK_IRQ(lock)
#define _read_lock_irq(lock) __LOCK_IRQ(lock)
#define _write_lock_irq(lock) __LOCK_IRQ(lock)
#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define _read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define _write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define _spin_trylock(lock) ({ __LOCK(lock); 1; })
#define _read_trylock(lock) ({ __LOCK(lock); 1; })
#define _write_trylock(lock) ({ __LOCK(lock); 1; })
#define _spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; })
#define _spin_unlock(lock) __UNLOCK(lock)
#define _read_unlock(lock) __UNLOCK(lock)
#define _write_unlock(lock) __UNLOCK(lock)
#define _spin_unlock_bh(lock) __UNLOCK_BH(lock)
#define _write_unlock_bh(lock) __UNLOCK_BH(lock)
#define _read_unlock_bh(lock) __UNLOCK_BH(lock)
#define _spin_unlock_irq(lock) __UNLOCK_IRQ(lock)
#define _read_unlock_irq(lock) __UNLOCK_IRQ(lock)
#define _write_unlock_irq(lock) __UNLOCK_IRQ(lock)
#define _spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags)
#define _read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags)
#define _write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags)
#endif /* __LINUX_SPINLOCK_API_UP_H */
In particular:
#define _spin_lock(lock) __LOCK(lock)
#define __LOCK(lock) \
do { preempt_disable(); __acquire(lock); (void)(lock); } while (0)
So on a uni-processor, the implementation of spin_lock() does nothing more than disable preemption. Next, look at the implementation of preempt_disable() in include/linux/preempt.h:
#ifdef CONFIG_PREEMPT
#define preempt_disable() \
do { \
inc_preempt_count(); \
barrier(); \
} while (0)
#else
#define preempt_disable() do { } while (0)
#endif
/* helper macros, from the same file: */
#define inc_preempt_count() add_preempt_count(1)
# define add_preempt_count(val) do { preempt_count() += (val); } while (0)
#define preempt_count() (current_thread_info()->preempt_count)
All of the above live in the same file; only the parts we need are excerpted here.
So under CONFIG_PREEMPT, preempt_disable() merely increments preempt_count; on a non-preemptible kernel it does nothing at all. Therefore, if you know the lock is only ever taken in process context, the plain, most traditional spin_lock() is enough. Even on a preemptible kernel, if a timer interrupt arrives and the time slice has run out, the task still will not be switched out while the lock is held, because it has been made non-preemptible.
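One consequence of preempt_count being a counter rather than a boolean is that these critical sections nest: preemption is truly re-enabled only when the count drops back to zero. A tiny sketch (lock_a and lock_b are hypothetical):

spin_lock(&lock_a);    /* preempt_count: 0 -> 1, preemption off      */
spin_lock(&lock_b);    /* preempt_count: 1 -> 2, still off           */
/* ... critical section ... */
spin_unlock(&lock_b);  /* preempt_count: 2 -> 1, STILL off           */
spin_unlock(&lock_a);  /* preempt_count: 1 -> 0, preemption possible */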
Next, let's look at the other important spin_lock variants:
spin_lock_irq
#define _spin_lock_irq(lock) __LOCK_IRQ(lock)
#define __LOCK_IRQ(lock) \
do { local_irq_disable(); __LOCK(lock); } while (0)
include/linux/irqflags.h:
#define local_irq_disable() \
do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
arch/x86/include/asm/irqflags.h:
static inline void raw_local_irq_disable(void)
{
native_irq_disable();
}
static inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
}
So on x86, spin_lock_irq() ultimately boils down to executing cli to disable local interrupts, then disabling preemption. The puzzling part is why preemption still has to be disabled when interrupts are already off: can you be preempted without an interrupt coming in? Without a timer interrupt, is preemption even possible? After thinking it over, one possibility is this: even with external interrupts masked, internal traps such as page faults can still occur. Suppose a page fault is triggered by a write to a read-only page; the kernel then performs copy-on-write to hand the process a fresh page. If memory happens to be tight at that moment, the process is put to sleep, and that is how a switch away from the lock holder could sneak in. This is my personal understanding; corrections are welcome.
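Here is a minimal sketch of the case spin_lock_irq() is designed for (dev_lock, dev_pending, my_isr, and consume_pending are hypothetical names): the lock is shared with an interrupt handler, so process context must keep local interrupts off while holding it.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(dev_lock);   /* hypothetical lock */
static int dev_pending;             /* hypothetical shared data */

/* Interrupt context: local interrupts are already off here,
 * so the plain spin_lock is sufficient. */
static irqreturn_t my_isr(int irq, void *dev_id)
{
        spin_lock(&dev_lock);
        dev_pending++;
        spin_unlock(&dev_lock);
        return IRQ_HANDLED;
}

/* Process context: must disable local interrupts while holding
 * the lock; otherwise my_isr could run in the middle of the
 * critical section (and on SMP would deadlock spinning on it). */
static void consume_pending(void)
{
        spin_lock_irq(&dev_lock);     /* cli + preempt_disable() on UP x86 */
        dev_pending = 0;
        spin_unlock_irq(&dev_lock);   /* sti + preempt_enable() */
}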
spin_lock_irqsave
#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags)
#define __LOCK_IRQSAVE(lock, flags) \
do { local_irq_save(flags); __LOCK(lock); } while (0)
include/linux/irqflags.h
#define local_irq_save(flags) \
do { \
typecheck(unsigned long, flags); \
raw_local_irq_save(flags); \
trace_hardirqs_off(); \
} while (0)
arch/x86/include/asm/irqflags.h
#define raw_local_irq_save(flags) \
do { (flags) = __raw_local_irq_save(); } while (0)
static inline unsigned long __raw_local_irq_save(void)
{
unsigned long flags = __raw_local_save_flags();
raw_local_irq_disable();
return flags;
}
static inline unsigned long __raw_local_save_flags(void)
{
return native_save_fl();
}
static inline unsigned long native_save_fl(void)
{
unsigned long flags;
asm volatile("# __raw_save_flags\n\t"
"pushf ; pop %0"
: "=g" (flags)
: /* no input */
: "memory");
return flags;
}
The difference between spin_lock_irqsave and spin_lock_irq: the former additionally saves the current interrupt flags.
Generally speaking, whenever a spin lock is shared between interrupt and process context and between process contexts at the same time, on a preemptible kernel, spin_lock_irqsave() is the one to use. If the lock is shared only between process and interrupt context, or the situation is as above but the kernel is non-preemptible, then local_irq_save() alone is enough.
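To illustrate why the saved flags matter (a minimal sketch; obj_lock and update_obj are hypothetical names): a function that may be called both with and without interrupts already disabled must not blindly re-enable them on unlock, which is exactly what spin_unlock_irq() would do.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(obj_lock);   /* hypothetical lock */

/* May be called from paths that already run with interrupts off. */
static void update_obj(void)
{
        unsigned long flags;

        spin_lock_irqsave(&obj_lock, flags);      /* saves IF, then cli */
        /* ... touch the shared object ... */
        spin_unlock_irqrestore(&obj_lock, flags); /* restores the saved IF
                                                   * instead of forcing sti */
}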
As for spin_lock_bh(), it disables softirqs (bottom halves); I have not studied its usage in detail.
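For completeness, a hedged sketch of what appears to be the usual pattern (bh_lock, rx_bytes, and the tasklet function are made-up names): spin_lock_bh() is typically used when process context shares data with a softirq or tasklet.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(bh_lock);   /* hypothetical lock */
static unsigned long rx_bytes;     /* hypothetical data shared with a tasklet */

/* Tasklet (softirq context): a plain spin_lock is enough here,
 * since another bottom half cannot preempt us on this CPU. */
static void rx_tasklet_fn(unsigned long data)
{
        spin_lock(&bh_lock);
        rx_bytes += data;
        spin_unlock(&bh_lock);
}

/* Process context: disable bottom halves so the tasklet cannot
 * run on this CPU while the lock is held. */
static void reset_stats(void)
{
        spin_lock_bh(&bh_lock);
        rx_bytes = 0;
        spin_unlock_bh(&bh_lock);
}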
WORK QUEUE
A work queue is a mechanism for deferring work. By default it uses the built-in worker threads (events/0, events/1, ..., one per CPU).
Usage is as follows (a complete example is sketched after the list):
1. If you think the default worker threads cannot keep up with your workload, you can create a dedicated worker thread:
create_workqueue(const char *name): creates one worker thread per processor.
create_singlethread_workqueue(const char *name): creates just a single worker thread.
Note that name here becomes the thread's name, which you can later see with ps aux once the code is running.
2. Next, declare the work; this can be done in two ways:
compile time: DECLARE_WORK(name, void (*function)(void *), void *data)
run time:
INIT_WORK(struct work_struct *work, void (*function)(void *), void *data)
PREPARE_WORK(struct work_struct *work, void (*function)(void *), void *data)
According to the book (Linux Device Drivers, 3rd edition), PREPARE_WORK does not initialize the work_struct. (Note these are the LDD3-era prototypes; since 2.6.20 the data pointer is gone and the work function takes a struct work_struct * instead, which is what a 2.6.29 kernel actually provides.)
3. Then queue the work onto a worker thread so it gets executed.
There are two options. The first queues it onto the default events worker threads:
schedule_work(), schedule_delayed_work()
The second queues it onto the worker thread you created yourself:
queue_work(struct workqueue_struct *queue, struct work_struct *work)
queue_delayed_work(struct workqueue_struct *queue, struct work_struct *work, unsigned long delay)
The first parameter, the workqueue_struct, is the worker thread you created yourself.
4. If you need to make sure the work you queued has actually completed, use
flush_workqueue(struct workqueue_struct *queue)
which blocks until every work item in the queue has finished (note it does not make the work run any sooner; for the default queue there is flush_scheduled_work()).
To cancel delayed work you have scheduled before it runs: cancel_delayed_work(struct delayed_work *work) (struct work_struct * in the LDD3-era API).
And if you want to tear down your kernel thread altogether: destroy_workqueue(struct workqueue_struct *queue).
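Putting the four steps together, here is a minimal sketch using the post-2.6.20 API that a 2.6.29 kernel provides (my_wq, struct my_work, my_work_fn, and submit_job are all hypothetical names invented for this example):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;   /* dedicated worker thread */

struct my_work {
        struct work_struct work;
        int payload;          /* post-2.6.20: data rides inside the struct */
};

static void my_work_fn(struct work_struct *work)
{
        struct my_work *mw = container_of(work, struct my_work, work);
        /* ... deferred processing using mw->payload ... */
        kfree(mw);
}

static int submit_job(int payload)
{
        struct my_work *mw = kmalloc(sizeof(*mw), GFP_KERNEL);

        if (!mw)
                return -ENOMEM;
        INIT_WORK(&mw->work, my_work_fn);
        mw->payload = payload;
        queue_work(my_wq, &mw->work);  /* or schedule_work() for events/N */
        return 0;
}

/* module init: my_wq = create_singlethread_workqueue("my_wq");
 * module exit: flush_workqueue(my_wq); destroy_workqueue(my_wq); */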