driver-know-hows

device driver related stuff

View on GitHub

Chapter 3: Advanced File Operations

Table of Contents

  1. The file_operations Structure in Depth
  2. ioctl - Device Control
  3. poll and select
  4. llseek - Seeking in Device Files
  5. mmap - Memory Mapping
  6. Asynchronous I/O
  7. fsync - Synchronization
  8. Advanced Examples

The file_operations Structure in Depth

Complete Structure Overview

#include <linux/fs.h>

/*
 * file_operations - Complete structure (kernel 5.x/6.x)
 *
 * Table of callbacks attached to an open file. A driver fills in only the
 * operations it supports and leaves every other member NULL.
 *
 * NOTE(review): this is a reference listing; the exact member list differs
 * between kernel releases (e.g. ->iterate is believed to be gone from
 * recent 6.x kernels) - confirm against <linux/fs.h> of the target kernel.
 */
struct file_operations {
    struct module *owner;
    
    /* Position operations */
    loff_t (*llseek) (struct file *, loff_t, int);
    
    /* Read/Write operations (plain buffer and iov_iter based variants) */
    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
    ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
    
    /* Directory operations (not used for device drivers) */
    int (*iterate) (struct file *, struct dir_context *);
    int (*iterate_shared) (struct file *, struct dir_context *);
    
    /* Polling (backs poll/select in userspace) */
    __poll_t (*poll) (struct file *, struct poll_table_struct *);
    
    /* Device control (native ioctl and the compat variant) */
    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
    
    /* Memory mapping */
    int (*mmap) (struct file *, struct vm_area_struct *);
    unsigned long mmap_supported_flags;
    
    /* Open/Release */
    int (*open) (struct inode *, struct file *);
    int (*flush) (struct file *, fl_owner_t id);
    int (*release) (struct inode *, struct file *);
    
    /* Synchronization */
    int (*fsync) (struct file *, loff_t, loff_t, int datasync);
    
    /* Async notification setup */
    int (*fasync) (int, struct file *, int);
    
    /* Locking */
    int (*lock) (struct file *, int, struct file_lock *);
    int (*flock) (struct file *, int, struct file_lock *);
    
    /* Splice operations (pipe <-> file transfers) */
    ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, 
                           loff_t *, size_t, unsigned int);
    ssize_t (*splice_read)(struct file *, loff_t *,
                          struct pipe_inode_info *, size_t, unsigned int);
    
    /* Misc */
    int (*setlease)(struct file *, long, struct file_lock **, void **);
    long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len);
    void (*show_fdinfo)(struct seq_file *m, struct file *f);
    ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                              loff_t, size_t, unsigned int);
    loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t len, unsigned int remap_flags);
};

Important Data Structures

/*
 * struct file - Represents an open file
 *
 * Created when file is opened, passed to most file operations.
 * Abridged listing: only the members a driver typically touches are
 * shown (see the "many more fields" marker at the end).
 */
struct file {
    struct path f_path;               /* File path */
    struct inode *f_inode;            /* Cached inode */
    const struct file_operations *f_op; /* File operations */
    
    unsigned int f_flags;             /* Open flags (O_RDONLY, O_NONBLOCK, etc.) */
    fmode_t f_mode;                   /* File mode (FMODE_READ, FMODE_WRITE) */
    loff_t f_pos;                     /* Current position */
    
    struct fown_struct f_owner;       /* Owner for async notification */
    const struct cred *f_cred;        /* File credentials */
    
    void *private_data;               /* Driver-specific data (free for the driver to use) */
    
    /* ... many more fields ... */
};

/*
 * struct inode - Represents a file on disk
 *
 * Contains metadata about the file. Abridged listing.
 *
 * NOTE(review): some members shown here (e.g. i_bdev and the plain
 * i_atime/i_mtime/i_ctime fields) are believed to have changed or been
 * removed in newer kernels - confirm against the target kernel headers.
 */
struct inode {
    umode_t i_mode;                   /* File type and permissions */
    unsigned short i_opflags;
    kuid_t i_uid;                     /* Owner user ID */
    kgid_t i_gid;                     /* Owner group ID */
    unsigned int i_flags;
    
    const struct inode_operations *i_op;
    struct super_block *i_sb;
    
    /* Only one of these is meaningful, depending on the file type */
    union {
        struct pipe_inode_info *i_pipe;
        struct block_device *i_bdev;
        struct cdev *i_cdev;          /* Character device */
        char *i_link;
    };
    
    dev_t i_rdev;                     /* Device number */
    loff_t i_size;                    /* File size */
    struct timespec64 i_atime;        /* Access time */
    struct timespec64 i_mtime;        /* Modification time */
    struct timespec64 i_ctime;        /* Change time */
    
    /* ... many more fields ... */
};

ioctl - Device Control

Theory: ioctl Overview

ioctl (input/output control) provides a mechanism for device-specific operations that don’t fit into the standard read/write model.

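Common uses include resetting a device, reading or changing a single parameter (such as its speed), and getting or setting a whole configuration structure in one call.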

ioctl Command Encoding

#include <linux/ioctl.h>

/*
 * ioctl command structure (32-bit value):
 * 
 * Bits:
 * 31-30: Direction (read/write)
 * 29-16: Size of argument (14 bits)
 * 15-8:  Type (magic number, usually ASCII character)
 * 7-0:   Command number (0-255)
 */

/*
 * Direction bits
 *
 * Directions are named from the userspace point of view: "write" means
 * the user passes data to the kernel, "read" means the kernel returns
 * data to the user.
 */
#define _IOC_NONE  0U    /* No data transfer */
#define _IOC_WRITE 1U    /* User writing to kernel */
#define _IOC_READ  2U    /* Kernel writing to user */

/*
 * Helper macros to create ioctl commands
 *
 * Each one forwards to _IOC(dir, type, nr, size), which is not defined in
 * this excerpt. The third parameter of _IOW/_IOR/_IOWR is a TYPE (for
 * example int or struct foo), not a byte count: the macros apply sizeof()
 * to it.
 */

/* No argument */
#define _IO(type, nr)        _IOC(_IOC_NONE, (type), (nr), 0)

/* Write parameter */
#define _IOW(type, nr, size) _IOC(_IOC_WRITE, (type), (nr), sizeof(size))

/* Read parameter */
#define _IOR(type, nr, size) _IOC(_IOC_READ, (type), (nr), sizeof(size))

/* Read and write parameter */
#define _IOWR(type, nr, size) _IOC(_IOC_READ|_IOC_WRITE, (type), (nr), sizeof(size))

/*
 * Macros to extract information from ioctl command
 *
 * The shifts and masks mirror the bit layout: direction in bits 31-30,
 * size in bits 29-16 (14 bits), type in bits 15-8, number in bits 7-0.
 */
#define _IOC_DIR(nr)   (((nr) >> 30) & 0x03)
#define _IOC_TYPE(nr)  (((nr) >> 8) & 0xFF)
#define _IOC_NR(nr)    (((nr) >> 0) & 0xFF)
#define _IOC_SIZE(nr)  (((nr) >> 16) & 0x3FFF)

Implementing ioctl

mydev_ioctl.h - Shared header for kernel and userspace:

/*
 * mydev_ioctl.h - ioctl definitions for mydevice
 * 
 * This file is included by both kernel driver and userspace applications
 */

#ifndef MYDEV_IOCTL_H
#define MYDEV_IOCTL_H

#include <linux/ioctl.h>

/*
 * Magic number for this driver
 * Choose an unused number from the kernel's ioctl-number.rst list
 * (NOTE(review): in current trees that file is believed to live under
 * Documentation/userspace-api/ioctl/ - confirm, and check that 'M' does
 * not collide with a driver present on the target system).
 * Use ASCII character for readability
 */
#define MYDEV_IOC_MAGIC 'M'

/*
 * Data structure for complex ioctl operations
 *
 * Its sizeof() is encoded into the GCONFIG/SCONFIG/XCONFIG command
 * numbers below, so changing the layout changes those command values.
 */
struct mydev_config {
    unsigned int speed;      /* Speed in Hz */
    unsigned int mode;       /* Operating mode */
    unsigned int flags;      /* Configuration flags */
    char name[32];           /* Device name */
};

/*
 * Define ioctl commands
 *
 * Naming used below: G = get (kernel -> user), S = set (user -> kernel),
 * X = exchange (both directions).
 */

/* Reset device - no argument */
#define MYDEV_IOCRESET    _IO(MYDEV_IOC_MAGIC, 0)

/* Get device speed - read int */
#define MYDEV_IOCGSPEED   _IOR(MYDEV_IOC_MAGIC, 1, int)

/* Set device speed - write int */
#define MYDEV_IOCSSPEED   _IOW(MYDEV_IOC_MAGIC, 2, int)

/* Get configuration - read struct */
#define MYDEV_IOCGCONFIG  _IOR(MYDEV_IOC_MAGIC, 3, struct mydev_config)

/* Set configuration - write struct */
#define MYDEV_IOCSCONFIG  _IOW(MYDEV_IOC_MAGIC, 4, struct mydev_config)

/* Exchange configuration - read/write struct */
#define MYDEV_IOCXCONFIG  _IOWR(MYDEV_IOC_MAGIC, 5, struct mydev_config)

/* Maximum command number (must track the highest nr used above) */
#define MYDEV_IOC_MAXNR 5

#endif /* MYDEV_IOCTL_H */

Kernel driver implementation:

/*
 * mydev_driver.c - Driver with ioctl implementation
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include "mydev_ioctl.h"

/*
 * Device data structure
 *
 * Holds the character device plus the settings exposed through ioctl.
 * Note that speed/mode/flags are plain int here while the matching
 * members of struct mydev_config are unsigned int.
 */
struct mydev_data {
    struct cdev cdev;
    int speed;
    int mode;
    int flags;
    char name[32];
};

/* Single global device instance, allocated in mydev_init() */
static struct mydev_data *mydev;
/* Device number obtained from alloc_chrdev_region() */
static dev_t dev_num;
/* Device class created in mydev_init() */
static struct class *dev_class;

/*
 * ioctl implementation
 * 
 * @filp: File pointer
 * @cmd:  ioctl command
 * @arg:  Command argument (can be int or pointer to struct)
 * 
 * Return: 0 on success, negative error code on failure
 */
static long mydev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
    int retval = 0;
    int tmp;
    struct mydev_config config;
    
    /*
     * Extract command components
     */
    unsigned int dir = _IOC_DIR(cmd);
    unsigned int type = _IOC_TYPE(cmd);
    unsigned int nr = _IOC_NR(cmd);
    unsigned int size = _IOC_SIZE(cmd);
    
    pr_info("ioctl: cmd=0x%x, dir=%u, type=%c, nr=%u, size=%u\n",
            cmd, dir, type, nr, size);
    
    /*
     * Verify ioctl command validity
     */
    
    /* Check magic number */
    if (type != MYDEV_IOC_MAGIC) {
        pr_err("ioctl: Invalid magic number\n");
        return -ENOTTY;  /* Inappropriate ioctl for device */
    }
    
    /* Check command number range */
    if (nr > MYDEV_IOC_MAXNR) {
        pr_err("ioctl: Invalid command number\n");
        return -ENOTTY;
    }
    
    /*
     * Verify user space pointer validity
     * 
     * access_ok() checks if user space address is valid
     * (replaced with just checking in newer kernels as copy_* does this)
     */
    if (dir & _IOC_READ)
        retval = !access_ok((void __user *)arg, size);
    else if (dir & _IOC_WRITE)
        retval = !access_ok((void __user *)arg, size);
    
    if (retval)
        return -EFAULT;
    
    /*
     * Handle each ioctl command
     */
    switch (cmd) {
    
    case MYDEV_IOCRESET:
        /*
         * Reset device to default state
         * No argument
         */
        pr_info("ioctl: RESET command\n");
        mydev->speed = 1000;
        mydev->mode = 0;
        mydev->flags = 0;
        strcpy(mydev->name, "default");
        break;
    
    case MYDEV_IOCGSPEED:
        /*
         * Get speed - copy to userspace
         */
        pr_info("ioctl: GET SPEED command\n");
        tmp = mydev->speed;
        if (copy_to_user((int __user *)arg, &tmp, sizeof(tmp)))
            return -EFAULT;
        break;
    
    case MYDEV_IOCSSPEED:
        /*
         * Set speed - copy from userspace
         */
        pr_info("ioctl: SET SPEED command\n");
        if (copy_from_user(&tmp, (int __user *)arg, sizeof(tmp)))
            return -EFAULT;
        
        /* Validate input */
        if (tmp < 0 || tmp > 100000) {
            pr_err("ioctl: Invalid speed value: %d\n", tmp);
            return -EINVAL;
        }
        
        mydev->speed = tmp;
        pr_info("ioctl: Speed set to %d\n", mydev->speed);
        break;
    
    case MYDEV_IOCGCONFIG:
        /*
         * Get configuration - copy struct to userspace
         */
        pr_info("ioctl: GET CONFIG command\n");
        config.speed = mydev->speed;
        config.mode = mydev->mode;
        config.flags = mydev->flags;
        strncpy(config.name, mydev->name, sizeof(config.name));
        
        if (copy_to_user((struct mydev_config __user *)arg, 
                        &config, sizeof(config)))
            return -EFAULT;
        break;
    
    case MYDEV_IOCSCONFIG:
        /*
         * Set configuration - copy struct from userspace
         */
        pr_info("ioctl: SET CONFIG command\n");
        if (copy_from_user(&config, (struct mydev_config __user *)arg,
                          sizeof(config)))
            return -EFAULT;
        
        /* Validate and apply configuration */
        mydev->speed = config.speed;
        mydev->mode = config.mode;
        mydev->flags = config.flags;
        strncpy(mydev->name, config.name, sizeof(mydev->name));
        mydev->name[sizeof(mydev->name) - 1] = '\0';  /* Ensure null termination */
        
        pr_info("ioctl: Config applied\n");
        break;
    
    case MYDEV_IOCXCONFIG:
        /*
         * Exchange configuration
         * Get current config, set new config, return old config
         */
        pr_info("ioctl: EXCHANGE CONFIG command\n");
        
        /* Read new config from user */
        if (copy_from_user(&config, (struct mydev_config __user *)arg,
                          sizeof(config)))
            return -EFAULT;
        
        /* Save old config */
        struct mydev_config old_config;
        old_config.speed = mydev->speed;
        old_config.mode = mydev->mode;
        old_config.flags = mydev->flags;
        strncpy(old_config.name, mydev->name, sizeof(old_config.name));
        
        /* Apply new config */
        mydev->speed = config.speed;
        mydev->mode = config.mode;
        mydev->flags = config.flags;
        strncpy(mydev->name, config.name, sizeof(mydev->name));
        
        /* Return old config to user */
        if (copy_to_user((struct mydev_config __user *)arg,
                        &old_config, sizeof(old_config)))
            return -EFAULT;
        break;
    
    default:
        pr_err("ioctl: Unknown command: 0x%x\n", cmd);
        return -ENOTTY;
    }
    
    return retval;
}

/*
 * mydev_open - open() handler
 *
 * Only logs the event; no per-open state is set up.
 *
 * Return: always 0
 */
static int mydev_open(struct inode *inode, struct file *filp)
{
    pr_info("Device opened\n");

    return 0;
}

/*
 * mydev_release - release() handler
 *
 * Only logs the event; there is nothing to tear down per open.
 *
 * Return: always 0
 */
static int mydev_release(struct inode *inode, struct file *filp)
{
    pr_info("Device closed\n");

    return 0;
}

static struct file_operations fops = {
    .owner          = THIS_MODULE,
    .open           = mydev_open,
    .release        = mydev_release,
    .unlocked_ioctl = mydev_ioctl,  /* ioctl handler */
};

/*
 * mydev_init - module entry point
 *
 * Allocates the device state, reserves a device number, creates the class,
 * registers the cdev and finally creates the /dev node. Every failure
 * unwinds exactly the steps that already succeeded.
 *
 * Return: 0 on success, negative error code on failure
 */
static int __init mydev_init(void)
{
    struct device *dev;
    int ret;

    /* Allocate device data */
    mydev = kzalloc(sizeof(*mydev), GFP_KERNEL);
    if (!mydev)
        return -ENOMEM;

    /* Initialize device data (same defaults as the RESET ioctl) */
    mydev->speed = 1000;
    mydev->mode = 0;
    mydev->flags = 0;
    strcpy(mydev->name, "default");

    /* Allocate device number */
    ret = alloc_chrdev_region(&dev_num, 0, 1, "mydev");
    if (ret < 0)
        goto err_alloc;

    /*
     * Create class
     * NOTE(review): newer kernels are believed to take only the name
     * argument here - confirm against the target kernel.
     */
    dev_class = class_create(THIS_MODULE, "mydev_class");
    if (IS_ERR(dev_class)) {
        ret = PTR_ERR(dev_class);
        goto err_class;
    }

    /* Initialize and add cdev */
    cdev_init(&mydev->cdev, &fops);
    mydev->cdev.owner = THIS_MODULE;
    ret = cdev_add(&mydev->cdev, dev_num, 1);
    if (ret < 0)
        goto err_cdev;

    /* Create device; propagate the real error instead of a fixed -EINVAL */
    dev = device_create(dev_class, NULL, dev_num, NULL, "mydev");
    if (IS_ERR(dev)) {
        ret = PTR_ERR(dev);
        goto err_device;
    }

    pr_info("mydev: Device initialized\n");
    return 0;

err_device:
    cdev_del(&mydev->cdev);
err_cdev:
    class_destroy(dev_class);
err_class:
    unregister_chrdev_region(dev_num, 1);
err_alloc:
    kfree(mydev);
    return ret;
}

/*
 * mydev_exit - module exit point
 *
 * Releases everything mydev_init() set up, in the reverse order of
 * creation: device node, cdev, class, device number, then the state.
 */
static void __exit mydev_exit(void)
{
    device_destroy(dev_class, dev_num);
    cdev_del(&mydev->cdev);
    class_destroy(dev_class);
    unregister_chrdev_region(dev_num, 1);
    kfree(mydev);
    pr_info("mydev: Device removed\n");
}

module_init(mydev_init);
module_exit(mydev_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tutorial Author");
MODULE_DESCRIPTION("Device driver with ioctl support");

Userspace test program:

/*
 * test_ioctl.c - Userspace program to test ioctl
 * 
 * Compile: gcc -o test_ioctl test_ioctl.c
 */

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <string.h>
#include "mydev_ioctl.h"

int main(int argc, char *argv[])
{
    int fd;
    int speed;
    struct mydev_config config;
    
    /* Open device */
    fd = open("/dev/mydev", O_RDWR);
    if (fd < 0) {
        perror("Failed to open device");
        return EXIT_FAILURE;
    }
    
    printf("Device opened successfully\n");
    
    /* Test 1: Reset device */
    printf("\n--- Test 1: Reset device ---\n");
    if (ioctl(fd, MYDEV_IOCRESET, NULL) < 0) {
        perror("IOCRESET failed");
    } else {
        printf("Device reset successfully\n");
    }
    
    /* Test 2: Get speed */
    printf("\n--- Test 2: Get speed ---\n");
    if (ioctl(fd, MYDEV_IOCGSPEED, &speed) < 0) {
        perror("IOCGSPEED failed");
    } else {
        printf("Current speed: %d\n", speed);
    }
    
    /* Test 3: Set speed */
    printf("\n--- Test 3: Set speed ---\n");
    speed = 5000;
    if (ioctl(fd, MYDEV_IOCSSPEED, &speed) < 0) {
        perror("IOCSSPEED failed");
    } else {
        printf("Speed set to: %d\n", speed);
    }
    
    /* Test 4: Get configuration */
    printf("\n--- Test 4: Get configuration ---\n");
    if (ioctl(fd, MYDEV_IOCGCONFIG, &config) < 0) {
        perror("IOCGCONFIG failed");
    } else {
        printf("Configuration:\n");
        printf("  Speed: %u\n", config.speed);
        printf("  Mode:  %u\n", config.mode);
        printf("  Flags: %u\n", config.flags);
        printf("  Name:  %s\n", config.name);
    }
    
    /* Test 5: Set configuration */
    printf("\n--- Test 5: Set configuration ---\n");
    config.speed = 10000;
    config.mode = 2;
    config.flags = 0x0F;
    strcpy(config.name, "custom_config");
    
    if (ioctl(fd, MYDEV_IOCSCONFIG, &config) < 0) {
        perror("IOCSCONFIG failed");
    } else {
        printf("Configuration set successfully\n");
    }
    
    /* Test 6: Exchange configuration */
    printf("\n--- Test 6: Exchange configuration ---\n");
    config.speed = 20000;
    config.mode = 3;
    config.flags = 0xFF;
    strcpy(config.name, "exchanged");
    
    if (ioctl(fd, MYDEV_IOCXCONFIG, &config) < 0) {
        perror("IOCXCONFIG failed");
    } else {
        printf("Old configuration (returned):\n");
        printf("  Speed: %u\n", config.speed);
        printf("  Mode:  %u\n", config.mode);
        printf("  Flags: %u\n", config.flags);
        printf("  Name:  %s\n", config.name);
    }
    
    /* Close device */
    close(fd);
    printf("\nDevice closed\n");
    
    return EXIT_SUCCESS;
}

poll and select

Theory: Blocking I/O vs Non-blocking I/O

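poll and select allow userspace programs to monitor multiple file descriptors for I/O readiness: the caller sleeps until at least one descriptor becomes readable or writable, or until a timeout expires.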

Implementing poll

/*
 * poll_driver.c - Driver with poll support
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/poll.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

/*
 * Per-device state for the poll example.
 *
 * The device holds at most one message at a time: a write fills the
 * buffer and makes it readable, a read empties it and makes it writable.
 */
struct poll_dev {
    struct cdev cdev;
    wait_queue_head_t read_queue;   /* Wait queue for readers */
    wait_queue_head_t write_queue;  /* Wait queue for writers */
    char buffer[256];
    size_t data_size;                /* Number of valid bytes in buffer */
    int readable;                    /* Flag: data available for reading */
    int writable;                    /* Flag: space available for writing */
};

/* Single global device instance, allocated in poll_dev_init() */
static struct poll_dev *pdev;
/* Device number obtained from alloc_chrdev_region() */
static dev_t dev_num;
/* Device class created in poll_dev_init() */
static struct class *dev_class;

/*
 * poll_dev_poll - report which operations would not block right now
 *
 * @filp: File pointer
 * @wait: Poll table supplied by the kernel
 *
 * Both wait queues are registered with the poll table first; that call
 * does not sleep, it only records which queues the caller should be woken
 * from if nothing is ready yet. The readiness flags are sampled afterwards.
 *
 * Return: bit mask of ready events (EPOLLIN/EPOLLRDNORM when data is
 *         pending, EPOLLOUT/EPOLLWRNORM when the buffer is free, 0 if
 *         neither). EPOLLERR, EPOLLHUP and EPOLLPRI are never reported.
 */
static __poll_t poll_dev_poll(struct file *filp, poll_table *wait)
{
    __poll_t events = 0;

    pr_info("poll: Called\n");

    poll_wait(filp, &pdev->read_queue, wait);
    poll_wait(filp, &pdev->write_queue, wait);

    /* Pending message -> reader will not block */
    if (pdev->readable) {
        events |= EPOLLIN | EPOLLRDNORM;
        pr_info("poll: Device is readable\n");
    }

    /* Empty buffer -> writer will not block */
    if (pdev->writable) {
        events |= EPOLLOUT | EPOLLWRNORM;
        pr_info("poll: Device is writable\n");
    }

    return events;
}

/*
 * poll_dev_read - hand the pending message to userspace
 *
 * @filp:  File pointer (f_flags decides blocking vs non-blocking)
 * @buf:   Userspace destination buffer
 * @count: Size of @buf
 * @f_pos: File position (unused, the device is not seekable)
 *
 * The whole message is consumed by one read: bytes that do not fit into
 * @buf are discarded. A zero-length read returns immediately and leaves
 * the pending message untouched.
 *
 * Return: number of bytes copied, -EAGAIN if non-blocking and no data,
 *         -ERESTARTSYS if interrupted while waiting, -EFAULT on a bad
 *         user buffer
 *
 * NOTE: the state flags are not protected by a lock, so two concurrent
 * readers can race for the same message.
 */
static ssize_t poll_dev_read(struct file *filp, char __user *buf,
                             size_t count, loff_t *f_pos)
{
    /* Nothing requested: do not wait and do not consume the message */
    if (count == 0)
        return 0;

    if (!pdev->readable) {
        /* Non-blocking mode: report "try again" instead of sleeping */
        if (filp->f_flags & O_NONBLOCK)
            return -EAGAIN;

        /* Blocking mode: sleep until a writer provides data */
        pr_info("read: Waiting for data...\n");
        if (wait_event_interruptible(pdev->read_queue, pdev->readable))
            return -ERESTARTSYS;  /* Interrupted by signal */
    }

    /* Never copy more than what was written */
    if (count > pdev->data_size)
        count = pdev->data_size;

    if (copy_to_user(buf, pdev->buffer, count))
        return -EFAULT;

    /* Buffer is empty again */
    pdev->data_size = 0;
    pdev->readable = 0;
    pdev->writable = 1;

    /* Wake up writers waiting for space */
    wake_up_interruptible(&pdev->write_queue);

    pr_info("read: Read %zu bytes\n", count);

    return count;
}

/*
 * poll_dev_write - store one message in the device buffer
 *
 * @filp:  File pointer (f_flags decides blocking vs non-blocking)
 * @buf:   Userspace source buffer
 * @count: Number of bytes offered
 * @f_pos: File position (unused, the device is not seekable)
 *
 * At most sizeof(buffer) bytes are accepted; the caller learns about a
 * short write from the return value. A zero-length write returns
 * immediately and does not mark the device readable, so readers are never
 * woken for an empty message.
 *
 * Return: number of bytes stored, -EAGAIN if non-blocking and the buffer
 *         is full, -ERESTARTSYS if interrupted while waiting, -EFAULT on
 *         a bad user buffer
 *
 * NOTE: the state flags are not protected by a lock, so two concurrent
 * writers can race for the buffer.
 */
static ssize_t poll_dev_write(struct file *filp, const char __user *buf,
                              size_t count, loff_t *f_pos)
{
    /* Nothing to store: leave the device state alone */
    if (count == 0)
        return 0;

    if (!pdev->writable) {
        /* Non-blocking mode: report "try again" instead of sleeping */
        if (filp->f_flags & O_NONBLOCK)
            return -EAGAIN;

        /* Blocking mode: sleep until a reader drains the buffer */
        pr_info("write: Waiting for space...\n");
        if (wait_event_interruptible(pdev->write_queue, pdev->writable))
            return -ERESTARTSYS;
    }

    /* Clamp to the buffer capacity */
    if (count > sizeof(pdev->buffer))
        count = sizeof(pdev->buffer);

    if (copy_from_user(pdev->buffer, buf, count))
        return -EFAULT;

    /* Buffer now holds one complete message */
    pdev->data_size = count;
    pdev->readable = 1;
    pdev->writable = 0;

    /* Wake up readers waiting for data */
    wake_up_interruptible(&pdev->read_queue);

    pr_info("write: Wrote %zu bytes\n", count);

    return count;
}

/*
 * poll_dev_open - open() handler
 *
 * Only logs the event; no per-open state is set up.
 *
 * Return: always 0
 */
static int poll_dev_open(struct inode *inode, struct file *filp)
{
    pr_info("Device opened\n");

    return 0;
}

/*
 * poll_dev_release - release() handler
 *
 * Only logs the event; there is nothing to tear down per open.
 *
 * Return: always 0
 */
static int poll_dev_release(struct inode *inode, struct file *filp)
{
    pr_info("Device closed\n");

    return 0;
}

static struct file_operations fops = {
    .owner   = THIS_MODULE,
    .open    = poll_dev_open,
    .release = poll_dev_release,
    .read    = poll_dev_read,
    .write   = poll_dev_write,
    .poll    = poll_dev_poll,  /* poll handler */
};

/*
 * poll_dev_init - module entry point
 *
 * Allocates the device state, initialises the wait queues, then performs
 * the usual character-device registration. Every failure, including a
 * failed device_create(), unwinds the steps that already succeeded.
 *
 * Return: 0 on success, negative error code on failure
 */
static int __init poll_dev_init(void)
{
    struct device *dev;
    int ret;

    pdev = kzalloc(sizeof(*pdev), GFP_KERNEL);
    if (!pdev)
        return -ENOMEM;

    /* Initialize wait queues */
    init_waitqueue_head(&pdev->read_queue);
    init_waitqueue_head(&pdev->write_queue);

    /* Initially writable (empty buffer) */
    pdev->readable = 0;
    pdev->writable = 1;
    pdev->data_size = 0;

    /* Standard device registration... */
    ret = alloc_chrdev_region(&dev_num, 0, 1, "polldev");
    if (ret < 0)
        goto err;

    /*
     * NOTE(review): newer kernels are believed to take only the name
     * argument here - confirm against the target kernel.
     */
    dev_class = class_create(THIS_MODULE, "polldev_class");
    if (IS_ERR(dev_class)) {
        ret = PTR_ERR(dev_class);
        goto err_class;
    }

    cdev_init(&pdev->cdev, &fops);
    ret = cdev_add(&pdev->cdev, dev_num, 1);
    if (ret < 0)
        goto err_cdev;

    /* Without the device node the driver is unusable, so treat as fatal */
    dev = device_create(dev_class, NULL, dev_num, NULL, "polldev");
    if (IS_ERR(dev)) {
        ret = PTR_ERR(dev);
        goto err_device;
    }

    pr_info("polldev: Initialized\n");
    return 0;

err_device:
    cdev_del(&pdev->cdev);
err_cdev:
    class_destroy(dev_class);
err_class:
    unregister_chrdev_region(dev_num, 1);
err:
    kfree(pdev);
    return ret;
}

/*
 * poll_dev_exit - module exit point
 *
 * Releases everything poll_dev_init() set up, in the reverse order of
 * creation: device node, cdev, class, device number, then the state.
 */
static void __exit poll_dev_exit(void)
{
    device_destroy(dev_class, dev_num);
    cdev_del(&pdev->cdev);
    class_destroy(dev_class);
    unregister_chrdev_region(dev_num, 1);
    kfree(pdev);
    pr_info("polldev: Removed\n");
}

module_init(poll_dev_init);
module_exit(poll_dev_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Driver with poll support");

Userspace test with poll:

/*
 * test_poll.c - Test poll functionality
 * 
 * Compile: gcc -o test_poll test_poll.c
 */

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>
#include <string.h>

/*
 * Exercises poll(), blocking read/write and non-blocking read on
 * /dev/polldev.
 *
 * Return: EXIT_SUCCESS if the device could be opened and no unexpected
 * error occurred, EXIT_FAILURE otherwise. The failing read in test 3 is
 * the expected outcome and is not counted as an error.
 */
int main(void)
{
    static const char msg[] = "Hello, poll!";
    struct pollfd fds[1];
    char buffer[256];
    ssize_t n;
    int status = EXIT_SUCCESS;
    int flags;
    int ret;
    int fd;

    /* Open device */
    fd = open("/dev/polldev", O_RDWR);
    if (fd < 0) {
        perror("open");
        return EXIT_FAILURE;
    }

    /* Setup poll structure */
    fds[0].fd = fd;
    fds[0].events = POLLIN | POLLOUT;  /* Monitor read and write */
    fds[0].revents = 0;

    printf("Testing poll...\n");

    /* Test 1: Check if writable (should be) */
    printf("\nTest 1: Check if device is writable\n");
    ret = poll(fds, 1, 5000);  /* 5 second timeout */
    if (ret < 0) {
        perror("poll");
        status = EXIT_FAILURE;
    } else if (ret == 0) {
        printf("Timeout - no events\n");
    } else {
        if (fds[0].revents & POLLOUT)
            printf("Device is writable!\n");
        if (fds[0].revents & POLLIN)
            printf("Device is readable!\n");
    }

    /* Test 2: Write data, then check if readable */
    printf("\nTest 2: Write data and check readability\n");
    n = write(fd, msg, strlen(msg));
    if (n < 0) {
        perror("write");
        status = EXIT_FAILURE;
    } else {
        printf("Data written\n");
    }

    fds[0].events = POLLIN;  /* Only monitor read */
    fds[0].revents = 0;
    ret = poll(fds, 1, 5000);
    if (ret < 0) {
        perror("poll");
        status = EXIT_FAILURE;
    } else if (ret > 0 && (fds[0].revents & POLLIN)) {
        printf("Device is readable!\n");
        n = read(fd, buffer, sizeof(buffer) - 1);
        if (n < 0) {
            /* Do not index buffer with a negative length */
            perror("read");
            status = EXIT_FAILURE;
        } else {
            buffer[n] = '\0';
            printf("Read: %s\n", buffer);
        }
    }

    /* Test 3: Non-blocking read when no data */
    printf("\nTest 3: Non-blocking read with no data\n");
    /* Add O_NONBLOCK to the existing flags instead of replacing them */
    flags = fcntl(fd, F_GETFL);
    if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
        perror("fcntl");
        status = EXIT_FAILURE;
    } else {
        n = read(fd, buffer, sizeof(buffer));
        if (n < 0)
            perror("Non-blocking read returned");
    }

    close(fd);
    return status;
}

llseek - Seeking in Device Files

Theory: File Position Management

Some devices support seeking (changing file position). For devices with addressable storage, implementing llseek allows random access.

Implementing llseek

/*
 * llseek implementation
 */

#include <linux/fs.h>

#define DEVICE_SIZE 4096  /* Size of our "device" in bytes; also the largest valid file position */

/*
 * Device state for the llseek example: a fixed-size in-memory buffer
 * standing in for addressable device storage, plus the character device.
 */
struct seekable_dev {
    char buffer[DEVICE_SIZE];
    struct cdev cdev;
};

/*
 * seekable_llseek - reposition the file offset inside the device
 *
 * @filp:   File pointer whose f_pos is updated
 * @offset: Offset to seek to/by
 * @whence: Reference point: SEEK_SET (start), SEEK_CUR (current
 *          position) or SEEK_END (DEVICE_SIZE)
 *
 * The resulting position must lie in [0, DEVICE_SIZE]; seeking exactly
 * to the end is allowed. f_pos is left unchanged on error.
 *
 * Return: New file position, or -EINVAL for an unknown @whence or an
 *         out-of-range result
 */
static loff_t seekable_llseek(struct file *filp, loff_t offset, int whence)
{
    loff_t base;
    loff_t target;

    pr_info("llseek: offset=%lld, whence=%d\n", offset, whence);

    /* Pick the reference point the offset is relative to */
    if (whence == SEEK_SET)
        base = 0;
    else if (whence == SEEK_CUR)
        base = filp->f_pos;
    else if (whence == SEEK_END)
        base = DEVICE_SIZE;
    else
        return -EINVAL;

    target = base + offset;

    /* Reject anything outside the device */
    if (!(target >= 0 && target <= DEVICE_SIZE)) {
        pr_err("llseek: Invalid position: %lld\n", target);
        return -EINVAL;
    }

    filp->f_pos = target;
    pr_info("llseek: New position: %lld\n", target);

    return target;
}

/*
 * For devices that don't support seeking
 */
static loff_t no_llseek(struct file *filp, loff_t offset, int whence)
{
    return -ESPIPE;  /* Illegal seek (like pipe) */
}

/*
 * Use default llseek (works for most cases)
 */
static struct file_operations fops = {
    .llseek = default_llseek,  /* Provided by kernel */
    /* ... other operations ... */
};

/*
 * Disable seeking completely
 */
static struct file_operations no_seek_fops = {
    .llseek = no_llseek,
    /* ... other operations ... */
};

mmap - Memory Mapping

Theory: Memory Mapped I/O

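mmap allows userspace to directly access device memory without a system call for each access. This is very efficient for large buffers and for data that is read or written frequently, where per-access read/write calls would dominate the cost.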

mmap Architecture

User Space                  Kernel Space
┌────────────┐              ┌────────────┐
│            │              │            │
│ User       │   mmap()     │  Device    │
│ Process    ├──────────────┤  Driver    │
│            │              │            │
│ ┌────────┐ │              │ ┌────────┐ │
│ │Virtual │ │              │ │Physical│ │
│ │Address │ ◄──────────────► │Memory  │ │
│ │Space   │ │  Page Tables │ │        │ │
│ └────────┘ │              │ └────────┘ │
└────────────┘              └────────────┘

Implementing mmap

/*
 * mmap_driver.c - Device driver with mmap support
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/slab.h>

#define MMAP_SIZE (4 * PAGE_SIZE)  /* 4 pages */

/*
 * Device state for the mmap example: the character device plus the
 * kernel buffer that gets mapped into userspace and its size in bytes.
 */
struct mmap_dev {
    struct cdev cdev;
    void *kmem;          /* Kernel memory buffer */
    unsigned long size;  /* Size of kmem in bytes (set to MMAP_SIZE in init) */
};

/* Single global device instance, allocated in mmap_dev_init() */
static struct mmap_dev *mdev;
/* Device number obtained from alloc_chrdev_region() */
static dev_t dev_num;
/* Device class created in mmap_dev_init() */
static struct class *dev_class;

/*
 * mmap_dev_fault - VM fault handler, resolves one page on first access
 *
 * @vmf: Fault description; pgoff is the faulting page index
 *
 * Translates the page index into a byte offset into the device buffer,
 * looks up the struct page behind that address, takes a reference on it
 * and hands it back through vmf->page.
 *
 * Return: 0 on success, VM_FAULT_SIGBUS if the offset lies beyond the
 *         device buffer
 */
static vm_fault_t mmap_dev_fault(struct vm_fault *vmf)
{
    unsigned long byte_off;
    struct page *pg;

    pr_info("mmap: Page fault at offset %lu\n", vmf->pgoff);

    /* Page index -> byte offset in device memory */
    byte_off = vmf->pgoff << PAGE_SHIFT;
    if (byte_off >= mdev->size)
        return VM_FAULT_SIGBUS;  /* Out of range */

    /* Find the page backing this part of the buffer and pin it */
    pg = virt_to_page(mdev->kmem + byte_off);
    get_page(pg);

    /* Report the page back to the fault path */
    vmf->page = pg;

    pr_info("mmap: Mapped page at offset %lu\n", byte_off);
    return 0;
}

static const struct vm_operations_struct mmap_vm_ops = {
    .fault = mmap_dev_fault,
};

/*
 * mmap implementation - Method 1: Using VM operations (on-demand)
 *
 * @filp: File pointer
 * @vma:  Virtual memory area describing the requested mapping
 *
 * No pages are mapped here. The handler validates the request, marks the
 * area and installs mmap_vm_ops so that pages are supplied by the fault
 * handler on first access.
 *
 * Return: 0 on success, -EINVAL if the request is larger than the device
 *         buffer or uses a non-zero offset
 *
 * NOTE(review): newer kernels are believed to require a helper instead of
 * writing vma->vm_flags directly - confirm against the target kernel.
 */
static int mmap_dev_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned long len = vma->vm_end - vma->vm_start;

    pr_info("mmap: Called\n");
    pr_info("mmap: Start=0x%lx, End=0x%lx, Offset=%lu\n",
            vma->vm_start, vma->vm_end, vma->vm_pgoff);

    /* The mapping may not be larger than the device buffer */
    if (len > mdev->size) {
        pr_err("mmap: Requested size too large\n");
        return -EINVAL;
    }

    /* Only mappings that start at the beginning are supported */
    if (vma->vm_pgoff > 0) {
        pr_err("mmap: Non-zero offset not supported\n");
        return -EINVAL;
    }

    /*
     * VM_IO         - This is I/O memory (not pageable)
     * VM_DONTEXPAND - Don't expand with mremap()
     * VM_DONTDUMP   - Don't include in core dump
     */
    vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;

    /* Pages are provided lazily by the fault handler */
    vma->vm_ops = &mmap_vm_ops;

    pr_info("mmap: Mapping %lu bytes\n", len);

    return 0;
}

/*
 * mmap implementation - Method 2: Direct mapping (all at once)
 *
 * @filp: File pointer
 * @vma:  Virtual memory area describing the requested mapping
 *
 * Maps the device buffer into the caller's address space immediately with
 * remap_pfn_range(), so no fault handler is involved afterwards.
 *
 * Like method 1, a non-zero offset is rejected: the mapping always starts
 * at the beginning of the buffer, so accepting an offset would silently
 * map different memory than the caller asked for.
 *
 * Return: 0 on success, -EINVAL for an oversized request or non-zero
 *         offset, or the error returned by remap_pfn_range()
 */
static int mmap_dev_mmap_direct(struct file *filp, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;
    unsigned long pfn;
    int ret;

    if (size > mdev->size) {
        pr_err("mmap: Requested size too large\n");
        return -EINVAL;
    }

    if (vma->vm_pgoff > 0) {
        pr_err("mmap: Non-zero offset not supported\n");
        return -EINVAL;
    }

    /* Physical page frame number of the start of the buffer */
    pfn = virt_to_phys(mdev->kmem) >> PAGE_SHIFT;

    /*
     * Maps all pages immediately (not on-demand). remap_pfn_range()
     * itself marks the area VM_IO | VM_PFNMAP | VM_DONTEXPAND |
     * VM_DONTDUMP, so the flags do not need to be set by hand here.
     */
    ret = remap_pfn_range(vma,
                         vma->vm_start,      /* User virtual address */
                         pfn,                /* Physical page frame number */
                         size,               /* Size to map */
                         vma->vm_page_prot); /* Protection */
    if (ret) {
        pr_err("mmap: remap_pfn_range failed\n");
        return ret;
    }

    pr_info("mmap: Mapped %lu bytes directly\n", size);

    return 0;
}

/*
 * open handler: no per-open state is set up; the call is only logged.
 * Always returns 0.
 */
static int mmap_dev_open(struct inode *inode, struct file *filp)
{
    pr_info("Device opened\n");
    return 0;
}

/*
 * release handler: nothing to tear down; the call is only logged.
 * Always returns 0.
 */
static int mmap_dev_release(struct inode *inode, struct file *filp)
{
    pr_info("Device closed\n");
    return 0;
}

static struct file_operations fops = {
    .owner   = THIS_MODULE,
    .open    = mmap_dev_open,
    .release = mmap_dev_release,
    .mmap    = mmap_dev_mmap,  /* Use on-demand mapping */
    /* .mmap = mmap_dev_mmap_direct, */  /* Or use direct mapping */
};

/*
 * Module entry point.
 *
 * Allocates the device state and the buffer that will be mapped into
 * user space, fills the buffer with a test pattern, then registers the
 * character device and creates its /dev node.
 *
 * Returns 0 on success or a negative errno. On failure everything that
 * was set up so far is released again, in reverse order.
 */
static int __init mmap_dev_init(void)
{
    struct device *dev;
    int ret;

    mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
    if (!mdev)
        return -ENOMEM;

    /*
     * Allocate kernel memory for mmap.
     *
     * NOTE(review): the mmap handlers treat this buffer as page-aligned.
     * kzalloc() does not promise that for every size; confirm it holds
     * for MMAP_SIZE, or move to a page allocator together with the
     * matching free in the exit path.
     */
    mdev->kmem = kzalloc(MMAP_SIZE, GFP_KERNEL);
    if (!mdev->kmem) {
        ret = -ENOMEM;
        goto err;   /* kfree(NULL) is a no-op, so the shared path is safe */
    }
    mdev->size = MMAP_SIZE;

    /* Write test pattern to memory */
    memset(mdev->kmem, 0xAA, MMAP_SIZE);

    /* Standard device registration */
    ret = alloc_chrdev_region(&dev_num, 0, 1, "mmapdev");
    if (ret < 0)
        goto err;

    dev_class = class_create(THIS_MODULE, "mmapdev_class");
    if (IS_ERR(dev_class)) {
        ret = PTR_ERR(dev_class);
        goto err_class;
    }

    cdev_init(&mdev->cdev, &fops);
    ret = cdev_add(&mdev->cdev, dev_num, 1);
    if (ret < 0)
        goto err_cdev;

    /*
     * device_create() reports failure through an ERR_PTR. Without this
     * check the module would load "successfully" with no device node.
     */
    dev = device_create(dev_class, NULL, dev_num, NULL, "mmapdev");
    if (IS_ERR(dev)) {
        ret = PTR_ERR(dev);
        goto err_device;
    }

    pr_info("mmapdev: Initialized with %lu bytes\n", mdev->size);
    return 0;

err_device:
    cdev_del(&mdev->cdev);
err_cdev:
    class_destroy(dev_class);
err_class:
    unregister_chrdev_region(dev_num, 1);
err:
    kfree(mdev->kmem);
    kfree(mdev);
    return ret;
}

/*
 * Module exit point.
 *
 * Undoes the work of mmap_dev_init() in reverse order: the device node
 * goes first, then the cdev, the class and the device number, and the
 * buffers are freed last.
 */
static void __exit mmap_dev_exit(void)
{
    device_destroy(dev_class, dev_num);
    cdev_del(&mdev->cdev);
    class_destroy(dev_class);
    unregister_chrdev_region(dev_num, 1);
    /* Free the mapped buffer before the structure that points to it. */
    kfree(mdev->kmem);
    kfree(mdev);
    pr_info("mmapdev: Removed\n");
}

/* Register the load and unload entry points with the kernel. */
module_init(mmap_dev_init);
module_exit(mmap_dev_exit);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Device with mmap support");

Userspace test:

/*
 * test_mmap.c - Test mmap functionality
 * 
 * Compile: gcc -o test_mmap test_mmap.c
 */

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>

#define MMAP_SIZE (4 * 4096)  /* 4 pages */

/*
 * Maps /dev/mmapdev, dumps the first bytes of the initial pattern,
 * overwrites the whole mapping with a counting pattern, dumps the first
 * bytes again, then unmaps and closes the device.
 *
 * Returns EXIT_SUCCESS only if every step succeeded, including the
 * final munmap() and close().
 */
int main(void)
{
    int status = EXIT_SUCCESS;
    int fd;
    void *mapped;
    unsigned char *ptr;
    int i;

    /* Open device */
    fd = open("/dev/mmapdev", O_RDWR);
    if (fd < 0) {
        perror("open");
        return EXIT_FAILURE;
    }

    /* Map device memory to user space */
    mapped = mmap(NULL,                   /* Let kernel choose address */
                  MMAP_SIZE,              /* Size to map */
                  PROT_READ | PROT_WRITE, /* Read/write access */
                  MAP_SHARED,             /* Share mapping with kernel */
                  fd,                     /* File descriptor */
                  0);                     /* Offset in device */

    if (mapped == MAP_FAILED) {
        perror("mmap");
        close(fd);
        return EXIT_FAILURE;
    }

    printf("Memory mapped successfully at %p\n", mapped);

    /* Access mapped memory */
    ptr = (unsigned char *)mapped;

    /* Read initial pattern */
    printf("\nInitial pattern (first 16 bytes):\n");
    for (i = 0; i < 16; i++) {
        printf("%02x ", ptr[i]);
    }
    printf("\n");

    /* Write new pattern */
    printf("\nWriting new pattern...\n");
    for (i = 0; i < MMAP_SIZE; i++) {
        ptr[i] = i & 0xFF;
    }

    /* Read back */
    printf("New pattern (first 16 bytes):\n");
    for (i = 0; i < 16; i++) {
        printf("%02x ", ptr[i]);
    }
    printf("\n");

    /* Unmap memory; a failure here must show up in the exit status */
    if (munmap(mapped, MMAP_SIZE) < 0) {
        perror("munmap");
        status = EXIT_FAILURE;
    } else {
        printf("\nMemory unmapped successfully\n");
    }

    if (close(fd) < 0) {
        perror("close");
        status = EXIT_FAILURE;
    }

    return status;
}

Summary

In this chapter, you learned:

ioctl: Device-specific control operations
poll/select: Non-blocking I/O and multiplexing
llseek: File position management
mmap: Memory-mapped I/O for high performance
Advanced file operations: Complete file_operations structure

Key Takeaways

  1. Use ioctl for device control, not for regular data transfer
  2. Implement poll for efficient I/O multiplexing
  3. mmap provides zero-copy access to device memory
  4. Always validate user input in all operations
  5. Use wait queues for blocking operations

Next Steps

Proceed to 04-memory.md to learn about kernel memory management, allocation strategies, and DMA.


Quick Reference

ioctl

/* Define commands */
#define MY_IOCRESET  _IO('M', 0)
#define MY_IOCGVAL   _IOR('M', 1, int)
#define MY_IOCSVAL   _IOW('M', 2, int)

/* Implement handler */
long my_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);

poll

/*
 * Implement poll: register on the wait queue, then report whether
 * data can be read right now.
 */
__poll_t my_poll(struct file *filp, poll_table *wait) {
    __poll_t mask = 0;

    poll_wait(filp, &my_queue, wait);

    if (data_ready)
        mask |= EPOLLIN | EPOLLRDNORM;

    return mask;
}

/* Wake up waiters */
wake_up_interruptible(&my_queue);

mmap

/*
 * Implement mmap: map the kernel buffer over the whole requested range
 * in one remap_pfn_range() call.
 */
int my_mmap(struct file *filp, struct vm_area_struct *vma) {
    unsigned long first_pfn = virt_to_phys(kmem) >> PAGE_SHIFT;
    unsigned long len = vma->vm_end - vma->vm_start;

    return remap_pfn_range(vma, vma->vm_start, first_pfn, len,
                           vma->vm_page_prot);
}