Introduction to Kernel Programming
-
Upload
ahmed-mekkawy -
Category
Technology
-
view
5.368 -
download
0
Transcript of Introduction to Kernel Programming
Introduction to Kernel Coding
Demystifying Kernel Programming
Outline
● Context of execution
● Memory
● I/O
Mechanism vs Policy
● Mechanism: Interface to the system resources
● Policy: How the resource is used
● Examples:
– Udev
– File configuration
Context of execution
● Possible contexts
– System Call
– Interrupt Handling
– Tasklets
– Kernel threads
Resource Handler
Resource
User process
Kernel thread
System Call Handling
Interrupt Handling
Userspace
Kernelspace
Tasklet
Why do we care?
● Blocking:
– Mutual exclusion / Reentrancy
– Resource Allocation
– Mixed context code
● System responsiveness
● Crashes – what's at stake
Interface
● General Pattern
– Central Data Structure
– Register entry points
– Entry point definition
● Know your subsystem
Resource Handler
interface { meth1, meth2, ... }
Register
deregister
meth1 (DS)
meth2 (DS)
Container
SUBSYSTEM
consumer
Op
Example – Fileops
DRIVER/FS MODULE
fileops { myopen, myread, myclose }
Register
deregister
myopen (FILE)
myread
myclose
M,M:FOPS
open(fd)
read
write
VFS
USER / KERNEL
Registration
● For certain type, e.g. filesystem
● For specific objects e.g. file ops
– Detection by the driver – legacy
– Detection by a bus driver
/*
 * ext3's "get superblock" entry point.  Pure delegation: the generic
 * block-device helper does the actual work and calls back into
 * ext3_fill_super to populate the superblock.
 */
static int ext3_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_bdev(fs_type, flags, dev_name, data,
			   ext3_fill_super, mnt);
}
static struct file_system_typeext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .get_sb = ext3_get_sb, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV,};
/* Module init: the registration step of the general pattern — hands the
 * ext3_fs_type descriptor to the VFS registry.  Error handling is
 * elided in this slide excerpt ("..."). */
static int __init init_ext3_fs(void){ ... err = register_filesystem (&ext3_fs_type);
... return 0;}
/* Consumer side of the interface: allocates a vfsmount, then calls back
 * into the registered filesystem through its get_sb entry point.
 * Elisions from the slide are marked "...". */
struct vfsmount * vfs_kern_mount( struct file_system_type *type,
int flags, const char *name, void *data){ struct vfsmount *mnt; int error; mnt = alloc_vfsmnt(name);
/* Indirect call through the entry point the filesystem registered. */
... error = type->get_sb(type, flags, name, data, mnt);
... mnt->mnt_mountpoint = mnt->mnt_root;
... return mnt;}
/*
 * Walk the singly linked file_systems registry looking for an entry
 * whose name matches.  Returns the address of the link slot (not the
 * entry), so a caller can insert at the terminating NULL slot; callers
 * hold file_systems_lock.
 */
static struct file_system_type **find_filesystem (const char *name, unsigned len)
{
	struct file_system_type **slot = &file_systems;

	while (*slot) {
		if (strlen((*slot)->name) == len &&
		    strncmp((*slot)->name, name, len) == 0)
			break;
		slot = &(*slot)->next;
	}
	return slot;
}
/* Resolves the filesystem name to its registered file_system_type and
 * mounts it: consumer code reaches the driver only through the
 * registry, never directly.  Elisions marked "...". */
struct vfsmount * do_kern_mount( const char *fstype,
int flags, const char *name, void *data){ struct file_system_type *type =
get_fs_type(fstype); struct vfsmount *mnt;
... mnt = vfs_kern_mount(type, flags, name, data);
... return mnt;}
/* Registration entry: inserts fs into the global file_systems list
 * under the writer side of file_systems_lock; returns -EBUSY if a
 * type with the same name is already registered. */
int register_filesystem(struct file_system_type * fs){ int res = 0; struct file_system_type ** p;
... INIT_LIST_HEAD(&fs->fs_supers); write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res;}
/*
 * Look up a registered filesystem type by name.  If it is not present,
 * ask userspace (request_module) to load a module of that name and
 * retry; on the retry path the owning module is pinned with
 * try_module_get() before returning.  Registry reads are protected by
 * the reader side of file_systems_lock.
 * Transcription fix: the slide garbled "unsigned len = strlen(name);"
 * into "unsigned len = ... strlen(name);".
 */
struct file_system_type *get_fs_type(const char *name)
{
	struct file_system_type *fs;
	unsigned len = strlen(name);

	read_lock(&file_systems_lock);
	fs = *(find_filesystem(name, len));
	read_unlock(&file_systems_lock);
	if (!fs && (request_module("%.*s", len, name) == 0)) {
		read_lock(&file_systems_lock);
		fs = *(find_filesystem(name, len));
		if (fs && !try_module_get(fs->owner))
			fs = NULL;
		read_unlock(&file_systems_lock);
	}
	return fs;
}
VFS
EXT3
Device Model
(Bovet et al)
SUBSYSTEM
kset
kobject
attribute1attribute2
...
Resource Handler
PCI
pci_register_driver
probe
driver_if{...
probe}
Scan actions
register_device
Interrupts
● Registering for interrupts
● Interrupt Handling – fast and alert
– Critical regions: Spinlocks and SMP systems
– Memory allocation
– System is unresponsive, interrupts masked
● Tasklets – pretty fast, pretty alert
● Workqueues – sleep all you want
Interrupt Handling
WORKQ handler
ISR
Initialization
Tasklet
request_irq
Device
Interrupt
KERNEL PROPER
schedule_work
tasklet_schedule
DRIVER
/* Top-half interrupt handler (registered with IRQF_SHARED in the probe
 * routine): under priv->irq_lock it reads the interrupt cause bits,
 * masks further device interrupts, acks the causes, stashes them in
 * priv->isr_inta, and defers the real work via tasklet_schedule().
 * Elisions from the slide are marked "...". */
static irqreturn_t ipw_isr(int irq, void *data){ struct ipw_priv *priv = data; u32 inta, inta_mask;
... spin_lock(&priv->irq_lock);
... inta_mask = ipw_read32(priv, IPW_INTA_MASK_R);
... if (!(inta & (IPW_INTA_MASK_ALL & inta_mask))) {
... } __ipw_disable_interrupts(priv); inta &= (IPW_INTA_MASK_ALL & inta_mask); ipw_write32(priv, IPW_INTA_RW, inta); priv->isr_inta = inta; tasklet_schedule(&priv->irq_tasklet); spin_unlock(&priv->irq_lock); return IRQ_HANDLED;}
/*
 * Workqueue handler for the link-down event.  Runs in process context
 * ("sleep all you want"), so it may take the sleeping priv->mutex
 * before doing the actual link teardown.
 */
static void ipw_bg_link_down(struct work_struct *work)
{
	struct ipw_priv *ipw = container_of(work, struct ipw_priv, link_down);

	mutex_lock(&ipw->mutex);
	ipw_link_down(ipw);
	mutex_unlock(&ipw->mutex);
}
/* Tasklet bottom half (softirq context): re-reads the cause bits under
 * irqsave locking, and for events whose handling may sleep (RF-kill)
 * pushes work items onto the workqueue instead of handling them here —
 * tasklets must not block.  Re-enables device interrupts when done. */
static void ipw_irq_tasklet(struct ipw_priv *priv){ u32 inta, inta_mask, handled = 0; unsigned long flags; spin_lock_irqsave(&priv->irq_lock, flags); inta = ipw_read32(priv, IPW_INTA_RW); inta_mask = ipw_read32(priv, IPW_INTA_MASK_R); inta &= (IPW_INTA_MASK_ALL & inta_mask); spin_unlock_irqrestore(&priv->irq_lock, flags); spin_lock_irqsave(&priv->lock, flags); ... if (inta & IPW_INTA_BIT_RF_KILL_DONE) { ... cancel_delayed_work(&priv->request_scan); ... schedule_work(&priv->link_down); queue_delayed_work(priv->workqueue, &priv->rf_kill, 2 * HZ); handled |= IPW_INTA_BIT_RF_KILL_DONE; } ... spin_unlock_irqrestore(&priv->lock, flags); /* enable all interrupts */ ipw_enable_interrupts(priv);}
/* PCI probe entry point: sets up the deferred-work machinery first,
 * then registers the (shared) interrupt line for this device with
 * request_irq(), passing priv as the per-device cookie. */
static int __devinitipw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent){ ... struct ipw_priv *priv; ... err = ipw_setup_deferred_work(priv); ... err = request_irq(pdev->irq, ipw_isr, IRQF_SHARED, DRV_NAME, priv); ...}
/* Initialization of the deferred contexts: a private workqueue, the
 * link_down work item, and the IRQ tasklet bound to this priv. */
static int __devinitipw_setup_deferred_work(struct ipw_priv *priv){ priv->workqueue = create_workqueue(DRV_NAME); ... INIT_WORK(&priv->link_down, ipw_bg_link_down); ... tasklet_init(&priv->irq_tasklet, (void (*)(unsigned long)) ipw_irq_tasklet, (unsigned long)priv); ...}
TASKLET
ISR
WORKQ
PROBE
What Address Space?!!!
● Flat space
– Access to pointers
– Symbols
● Across the boundary
– copy_to_user / copy_from_user
/* sendmsg(2) system-call entry: runs in process context on behalf of a
 * user process.  Note the __user annotations and the explicit
 * copy_from_user() before any kernel code touches the message header —
 * userspace pointers are never dereferenced directly across the
 * boundary.  Elisions from the slide are marked "...". */
asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags){ struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; struct socket *sock; struct sockaddr_storage address; struct iovec *iov = iovstack; struct msghdr msg_sys; int err, iov_size, fput_needed;
... if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) return -EFAULT;
... sock = sockfd_lookup_light(fd, &err, &fput_needed);
... iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); ... iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
... err = verify_iovec(&msg_sys, iov, (struct sockaddr *)&address, VERIFY_READ);
... err = sock_sendmsg(sock, &msg_sys, total_len);
... return err;}
/* Maps a struct file back to its socket: valid only when the file's
 * f_op table is the sockets one, in which case private_data holds the
 * socket (the non-socket path is elided in this excerpt). */
static struct socket *sock_from_file(struct file *file, int *err){ if (file->f_op == &socket_file_ops) return file->private_data; ...}
/*
 * Resolve a user-supplied fd to its socket.  fget_light() takes a
 * lightweight reference on the file; if the file turns out not to be
 * a socket, that reference is dropped again before returning NULL.
 */
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
	struct socket *sock;
	struct file *file = fget_light(fd, fput_needed);

	if (!file)
		return NULL;

	sock = sock_from_file(file, err);
	if (sock)
		return sock;

	fput_light(file, *fput_needed);
	return NULL;
}
/* fd-table lookup under RCU: fget_light() resolves an fd to a struct
 * file for the current process inside an rcu_read_lock() section;
 * fcheck_files() dereferences the RCU-protected fdtable and fd slot;
 * free_fdtable() defers the actual free via call_rcu() so concurrent
 * lockless readers stay safe.  (Several definitions are fused onto
 * single lines by the slide extraction.) */
#define files_fdtable(files) (rcu_dereference((files)->fdt))static inline void free_fdtable(struct fdtable *fdt){
call_rcu(&fdt->rcu, free_fdtable_rcu);}struct file *fget_light(unsigned int fd, int *fput_needed){ struct file *file; struct files_struct *files = current->files; *fput_needed = 0;
... rcu_read_lock(); file = fcheck_files(files, fd);
... rcu_read_unlock();
... return file;}static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd){ struct file * file = NULL; struct fdtable *fdt = files_fdtable(files);
... file = rcu_dereference(fdt->fd[fd]); return file;}
SOCKETS
FS
/*
 * The fd array is reached through an RCU-protected struct fdtable
 * (fields other than fd/rcu elided).  move_addr_to_kernel() shows the
 * standard pattern for crossing the user/kernel boundary: copy the
 * user sockaddr into kernel space with copy_from_user(), returning
 * -EFAULT on a bad user pointer.
 * Transcription fix: the trailing ".,," was a garbled "..." elision.
 */
struct fdtable {
	...
	struct file **fd;
	struct rcu_head rcu;
	...
};

int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
{
	if (copy_from_user(kaddr, uaddr, ulen))
		return -EFAULT;
	...
}
Allocation and flags
● Page Frame
● Memory allocation
– Atomicity : GFP_ATOMIC from Reserved Pfs – no sleep
– Contiguity
– Region: GFP_HIGHMEM, GFP_DMA, GFP_KERNEL
● Slab allocator
Manipulating User memory
● Remapping page frames
● Handling page faults
– Define vm_operations with a page fault handler
– Mark page frames to fault (e.g. fork in copy on write)
/* mmap entry point of the framebuffer device: translates the vma's
 * file offset into a physical range inside the device's smem_start
 * aperture, flags the vma VM_IO | VM_RESERVED, and eagerly maps the
 * page frames into the user address space with io_remap_pfn_range().
 * register_framebuffer() below is the registration side that made this
 * fb_info reachable through registered_fb[].  Elisions marked "...". */
static intfb_mmap(struct file *file, struct vm_area_struct * vma){ int fbidx = iminor(file->f_path.dentry->d_inode); struct fb_info *info = registered_fb[fbidx]; unsigned long off; unsigned long start; u32 len;
... off = vma->vm_pgoff << PAGE_SHIFT; ... lock_kernel();
... /* frame buffer memory */ start = info->fix.smem_start; len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.smem_len);
... unlock_kernel(); start &= PAGE_MASK;
.... off += start; vma->vm_pgoff = off >> PAGE_SHIFT; vma->vm_flags |= VM_IO | VM_RESERVED;
... if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot)) return -EAGAIN; return 0;}intregister_framebuffer(struct fb_info *fb_info){
... registered_fb[i] = fb_info; ... return 0;}
/* nvidia driver's PCI probe: allocates the fb_info resource object,
 * records the PCI BAR 1 base as the framebuffer memory start, and
 * registers it with the framebuffer subsystem.  Elisions marked "...". */
static int __devinit nvidiafb_probe(struct pci_dev *pd, const struct pci_device_id *ent)
{ struct fb_info *info;
info = framebuffer_alloc(sizeof(struct nvidia_par), &pd->dev);...
nvidiafb_fix.smem_start = pci_resource_start(pd, 1);...
if (register_framebuffer(info) < 0) { printk(KERN_ERR PFX "error registering nVidia framebuffer\n");
... }
... return 0;}
NVIDIA
FRAME BUFFER
Manipulating VMAstatic int snd_pcm_mmap_status_fault(struct vm_area_struct *area, struct vm_fault *vmf){ struct snd_pcm_substream *substream = area->vm_private_data; struct snd_pcm_runtime *runtime;
runtime = substream->runtime; vmf->page = virt_to_page(runtime->status); get_page(vmf->page); return 0;}static struct vm_operations_struct snd_pcm_vm_ops_status ={ .fault = snd_pcm_mmap_status_fault,};
/*
 * mmap handler for the PCM status record: verify the vma is readable
 * and exactly one page-aligned status structure long, then install the
 * fault-based vm_operations instead of mapping any pages eagerly.
 */
static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area)
{
	long span;

	if (!(area->vm_flags & VM_READ))
		return -EINVAL;

	span = area->vm_end - area->vm_start;
	if (span != PAGE_ALIGN(sizeof(struct snd_pcm_mmap_status)))
		return -EINVAL;

	area->vm_ops = &snd_pcm_vm_ops_status;
	area->vm_private_data = substream;
	area->vm_flags |= VM_RESERVED;
	return 0;
}
I/O
● Control data:
– I/O memory remapping
● Data transfer:
– DMA
– PCI Scatter Gather
/*
 * Claim the HBA's PCI regions, verify that BAR 1 is a memory resource
 * large enough for the register file, and ioremap it so all register
 * access can use MMIO.  Every failure funnels through one exit label
 * (goto-based cleanup).
 * Transcription fix: the return type and function name were fused
 * ("static intqla2x00_iospace_config") by a lost line break.
 */
static int qla2x00_iospace_config(scsi_qla_host_t *ha)
{
	resource_size_t pio;	/* PIO fallback setup elided from this excerpt */

	if (pci_request_selected_regions(ha->pdev, ha->bars, QLA2XXX_DRIVER_NAME)) {
		goto iospace_error_exit;
	}

	/* Use MMIO operations for all accesses. */
	if (!(pci_resource_flags(ha->pdev, 1) & IORESOURCE_MEM)) {
		goto iospace_error_exit;
	}
	if (pci_resource_len(ha->pdev, 1) < MIN_IOBASE_LEN) {
		goto iospace_error_exit;
	}

	ha->iobase = ioremap(pci_resource_start(ha->pdev, 1), MIN_IOBASE_LEN);
	if (!ha->iobase) {
		goto iospace_error_exit;
	}

	return (0);

iospace_error_exit:
	return (-ENOMEM);
}
/* MMIO register accessors: writew()/readw_relaxed() operate on the
 * ioremap'ed BAR address, never on a raw physical address. */
#define WRT_REG_WORD(addr, data) writew(data,addr)
#define RD_REG_WORD_RELAXED(addr) readw_relaxed(addr)

/* Chip-generation dispatch: the request-queue "in" register lives at a
 * different offset on ISP2100/2200 than on ISP2300.
 * Transcription fix: the three #defines were fused onto one line and
 * the continuation backslashes collapsed mid-line. */
#define ISP_REQ_Q_IN(ha, reg) \
	(IS_QLA2100(ha) || IS_QLA2200(ha) ? \
	 &(reg)->u.isp2100.mailbox4 : \
	 &(reg)->u.isp2300.req_q_in)
/* I/O submission path: dma_map_sg() maps the SCSI scatter/gather list
 * for DMA, and the final write/readback pair updates the chip's ring
 * index over MMIO — the readback flushes PCI posted writes (see the
 * "PCI Posting" comment).  Elisions marked "...". */
intqla2x00_start_scsi(srb_t *sp){ scsi_qla_host_t *ha;
... if (scsi_sg_count(cmd)) { nseg = dma_map_sg(&ha->pdev->dev, scsi_sglist(cmd), scsi_sg_count(cmd), cmd->sc_data_direction); } else nseg = 0;
... /* Set chip new ring index. */ WRT_REG_WORD(ISP_REQ_Q_IN(ha, reg), ha->req_ring_index); RD_REG_WORD_RELAXED(ISP_REQ_Q_IN(ha, reg)); /* PCI Posting. */}
Know your Subsystem
● Specific structures
– Interface (entry points)
– The resource objects
● Specific registration interface
● Specific objects
References
● Understanding the Linux Kernel (Daniel Bovet, Marco Cesati)
● Linux Device Drivers (Alessandro Rubini)
● Linux Kernel Development (Robert Love)
● Essential Linux Device Drivers (Sreekrishnan Venkateswaran)
● Kernel Documentation
● Code
● http://www.gelato.unsw.edu.au/~dsw/public-files/kernel-docs/kernel-api/