http://weichong78.blogspot.com/

exploring page cache and picking up oops debugging

A while ago, I wrote a small toy kernel module to understand how a process interact with page cache after accessing a file, to verify what I have learn from the great ULK3:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <asm/uaccess.h>

#define FILEPROBE_MAJOR 200
#define FILEPROBE_MINOR 0

struct fileprobe {
int qpid;
dev_t dev;
struct cdev cdev;
};

struct fileprobe fp;

static int fileprobe_open (struct inode *inode, struct file *file)
{
printk("fileprobe_open\n");
return 0;
}

static ssize_t fileprobe_read (struct file* file, char __user *buf, size_t count, loff_t *ppos)
{
printk("fileprobe_read\n");
return 0;
}

static void fileprobe_queryfile (struct file* qfile)
{
struct page* qpage;
char* qcontent;

printk("fileprobe_queryfile\n");
printk("find_get_page 0\n");
qpage = find_get_page(qfile->f_mapping, 0);
if (NULL != qpage) {
qcontent = (char*)kmap(qpage);
if (NULL != qcontent) {
printk(" the content of 1st page is [%c%c%c%c]\n", qcontent[0], qcontent[1], qcontent[2], qcontent[3]);
kunmap((void*)qcontent);
}
}
printk("find_get_page 1\n");
qpage = find_get_page(qfile->f_mapping, 1);
if (NULL != qpage) {
qcontent = (char*)kmap(qpage);
if (NULL != qcontent) {
printk(" the content of 2nd page is [%c%c%c%c]\n", qcontent[0], qcontent[1], qcontent[2], qcontent[3]);
kunmap((void*)qcontent);
}
}
}

static ssize_t fileprobe_write (struct file* file, const char __user *buf, size_t count, loff_t *ppos)
{
struct task_struct *qtask;
int fd_nr;
struct file* qfile;

printk("fileprobe_write\n");
get_user(fp.qpid, (int __user *)buf);

for_each_process(qtask) {
if(qtask->pid == fp.qpid) {
printk("task_struct found for PID %d, %s\n", qtask->pid, qtask->comm);
for(fd_nr = 0; fd_nr <>files->fdt->max_fds; fd_nr++) {
qfile = qtask->files->fdt->fd[fd_nr];
if(qfile != NULL) {
printk(" fd_nr is %d, dentry->d_iname is %s\n", fd_nr, qfile->f_path.dentry->d_iname);
if(0 == strcmp(qfile->f_path.dentry->d_iname,"fileprobe.log")) {
fileprobe_queryfile(qfile);
}
}
}
}
}
return 0;
}

struct file_operations f_ops = {
.open = fileprobe_open,
/* .release = fileprobe_release, */
.read = fileprobe_read,
.write = fileprobe_write
};

int __init fileprobe_init (void)
{
int ret;
printk("fileprobe_init\n");
cdev_init(&fp.cdev, &f_ops);
fp.dev = MKDEV(FILEPROBE_MAJOR, FILEPROBE_MINOR);
ret = register_chrdev_region(fp.dev, 1, "fileprobe");
ret = cdev_add(&fp.cdev, fp.dev, 1);
if (ret != 0) goto err1;
return 0;
err1:
unregister_chrdev_region(fp.dev, 1);
return -1;
}

void __exit fileprobe_exit (void)
{
cdev_del(&fp.cdev);
unregister_chrdev_region(fp.dev, 1);
printk("fileprobe_exit\n");
}

module_init(fileprobe_init);
module_exit(fileprobe_exit);

MODULE_LICENSE("GPL");


Basically, it is a simple character device module that accept a PID as argument. With that PID, it search for the task_struct from the task list. It then search the task_struct's file descriptor table to look for an open file with the file name (from dentry) "fileprobe.log". With the located struct file, it try to look for the corresponding struct address_space through f_mapping. The address_space structure has a (if I remember correctly) red-black tree of struct page which identify which physical page frame host the file content. The module utilizes the kernel function find_get_page() to locate the page frames by passing the 4k page order as the second argument. It then use kmap to obtain a virtual address of the physical page and use printk to display partial content of "fileprobe.log".

Now this whole thing is very toyish and canned for learning purpose only. The content of "fileprobe.log" is exactly 8k. The first 4k starts with 3 characters "1st" follow by all 'X' characters while the second 4k starts with 3 characters "2nd" follow by all 'Y', to make it easy for verification. Basically, such a file can easily be created by a perl command line like

perl -e 'print "X"x4096'

with the first 3 characters replaced (vim recommended :-).

I then conduct the experiment by using less to open the "fileprobe.log" file, obtain the PID of the less process and write the value to the device node of the kernel module using a user space program like:

#include
#include
#include

int main (int argc, char** argv)
{
int qpid = atoi(argv[1]);
printf("qpid is %d\n", qpid);
int fd = open("./fileprobe", O_WRONLY);
write(fd, &qpid, sizeof(qpid));
close(fd);
return 0;
}


With the description above, all went well. But the above kernel module was not the original version, which had the below difference:


--- fileprobe.c 2008-10-21 21:12:23.000000000 +0800
+++ fileprobe.c.orig 2008-10-21 21:11:07.000000000 +0800
@@ -42,7 +42,7 @@
if (NULL != qpage) {
qcontent = (char*)kmap(qpage);
if (NULL != qcontent) {
- printk(" the content of 1st page is [%c%c%c%c]\n", qcontent[0], qcontent[1], qcontent[2], qcontent[3]);
+ printk(" the content of 1st page is [%s]\n", qcontent);
kunmap((void*)qcontent);
}
}
@@ -51,7 +51,7 @@
if (NULL != qpage) {
qcontent = (char*)kmap(qpage);
if (NULL != qcontent) {
- printk(" the content of 2nd page is [%c%c%c%c]\n", qcontent[0], qcontent[1], qcontent[2], qcontent[3]);
+ printk(" the content of 2nd page is [%s]\n", qcontent);
kunmap((void*)qcontent);
}
}



This original version actually gave an OOPs the first time I tried, which looks like this:


[ 852.962496] fileprobe_init
[ 853.794718] fileprobe_open
[ 853.794728] fileprobe_write
[ 853.794753] task_struct found for PID 9096, less
[ 853.794756] fd_nr is 0, dentry->d_iname is tty2
[ 853.794758] fd_nr is 1, dentry->d_iname is tty2
[ 853.794761] fd_nr is 2, dentry->d_iname is tty2
[ 853.794763] fd_nr is 3, dentry->d_iname is tty
[ 853.794765] fd_nr is 4, dentry->d_iname is fileprobe.log
[ 853.794767] fileprobe_queryfile
[ 853.794769] find_get_page 0
[ 853.794810] BUG: unable to handle kernel paging request at virtual address ffa0b000
[ 853.794813] printing eip: c021a6f6 *pde = 00004067 *pte = 00000000
[ 853.794819] Oops: 0000 [#1] SMP
[ 853.794898] Modules linked in: fileprobe michael_mic arc4 ecb blkcipher ieee80211_crypt_tkip appletalk ax25 ipx p8023 binfmt_misc rfcomm l2cap bluetooth uinput ppdev i915 drm ipv6 sbs container sbshc dock cpufreq_conservative cpufreq_ondemand cpufreq_userspace cpufreq_powersave cpufreq_stats freq_table iptable_filter ip_tables x_tables aes_i586 dm_crypt dm_mod lp af_packet pcmcia battery ac snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm_oss snd_mixer_oss snd_pcm psmouse ipw2200 serio_raw yenta_socket rsrc_nonstatic pcmcia_core ieee80211 ieee80211_crypt snd_seq_dummy video output snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event button snd_seq snd_timer snd_seq_device intel_agp thinkpad_acpi nvram evdev agpgart parport_pc parport shpchp pci_hotplug iTCO_wdt iTCO_vendor_support snd soundcore snd_page_alloc pcspkr ext3 jbd mbcache sg sr_mod cdrom sd_mod ata_generic pata_acpi ata_piix e100 mii libata scsi_mod ehci_hcd uhci_hcd usbcore thermal processor fan fbcon tileblit font bitblit softcursor fuse
[ 853.797243]
[ 853.797281] Pid: 9146, comm: fileprobe_user Not tainted (2.6.24-19-generic #1)
[ 853.797343] EIP: 0060:[] EFLAGS: 00010097 CPU: 0
[ 853.797392] EIP is at strnlen+0x6/0x20
[ 853.797433] EAX: ffa0b000 EBX: c048e93e ECX: ffa0a000 EDX: ffffeffe
[ 853.797479] ESI: ffa0a000 EDI: 00000000 EBP: ffffffff ESP: ebb97e88

[ 853.797525] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[ 853.797570] Process fileprobe_user (pid: 9146, ti=ebb96000 task=ea3dc5c0 task.ti=ebb96000)
[ 853.797619] Stack: c0219c99 000c2091 00000000 0000000a 00000006 ffffffff 00000001 00000400
[ 853.797848] c048e920 0000000a c048ed20 ffa0a000 ffffffff ffffffff f8e1731b 00000400
[ 853.798075] ffa0a000 ea308b80 00000000 c0219f14 ebb97f68 ea1e6600 c012ce7c ebb97f64
[ 853.798303] Call Trace:
[ 853.798372] [] vsnprintf+0x459/0x610
[ 853.798453] [] vscnprintf+0x14/0x20
[ 853.798519] [] vprintk+0x5c/0x380
[ 853.798591] [] set_page_address+0xa2/0x160
[ 853.798663] [] kmap_high+0x18f/0x1b0
[ 853.798732] [] printk+0x1b/0x20
[ 853.798795] [] fileprobe_write+0x117/0x180 [fileprobe]
[ 853.798866] [] fileprobe_write+0x0/0x180 [fileprobe]
[ 853.798931] [] vfs_write+0xb9/0x170
[ 853.799000] [] sys_write+0x41/0x70
[ 853.799067] [] sysenter_past_esp+0x6b/0xa9
[ 853.799143] =======================
[ 853.799182] Code: 90 8d 74 26 00 85 c9 57 89 c7 89 d0 74 05 f2 ae 75 01 4f 89 f8 5f c3 8d b4 26 00 00 00 00 8d bc 27 00 00 00 00 89 c1 89 c8 eb 06 <80> 38 00 74 07 40 4a 83 fa ff 75 f4 29 c8 c3 90 90 90 90 90 90
[ 853.800471] EIP: [] strnlen+0x6/0x20 SS:ESP 0068:ebb97e88
[ 853.800555] ---[ end trace 45550c129f3be6fa ]---


The OOPs actually puzzled me for a while, until I talked to a mighty parrot this afternoon, and he noticed the "strnlen" in the OOPs, which I was careless enough to missed that obvious hint, thinking that it could be related instead to kmap, all these while, silly me.

So, go along with that hint, I tried to modify the module which looks like the final version above, and yippie, it worked, with the following dmesg instead:


[ 686.833604] fileprobe_init
[ 724.422097] fileprobe_open
[ 724.422107] fileprobe_write
[ 724.422139] task_struct found for PID 7564, less
[ 724.422142] fd_nr is 0, dentry->d_iname is 1
[ 724.422144] fd_nr is 1, dentry->d_iname is 1
[ 724.422146] fd_nr is 2, dentry->d_iname is 1
[ 724.422148] fd_nr is 3, dentry->d_iname is tty
[ 724.422151] fd_nr is 4, dentry->d_iname is fileprobe.log
[ 724.422153] fileprobe_queryfile
[ 724.422154] find_get_page 0
[ 724.422157] the content of 1st page is [1stX]
[ 724.422159] find_get_page 1
[ 724.422161] the content of 2nd page is [2ndY]
[ 724.422163] fd_nr is 11, dentry->d_iname is
[ 724.422166] fd_nr is 62, dentry->d_iname is
[ 724.422168] fd_nr is 63, dentry->d_iname is


Notice the content of the page starts with "1stX" and the second with "2ndY"? Well, looks like what I read in ULK3 is true.

By the way, on an unrelated account, the mighty parrot also told me that /etc in Linux stands for "errata configuration" and /usr stands for "UNIX service resource". :-)

Ok, it is getting late, and I'm tired, so that's all I'm going to dump in this post for now. If there is any error you notice, please do drop me a comment or email so that I can correct them.

Tuesday, October 14, 2008

objdump vmlinux

Being able to quickly figure out which part of the kernel source code correspond to the binary (in memory image of) running kernel is always handy, especially when debugging a faulty/oops/crashing system.
A while ago, I used objdump -d -S on the .ko file of the driver I compiled which would intersperse my driver source with the .ko binary. Thinking along that line, vmlinux should be able to give us similar output with the kernel source code. After searching around in the web, I found this site that confirms this.
I tried objdump -d -S vmlinux and got a huge 65MB text file that list what I have expected, part of it looks like this:

vmlinux: file format elf32-i386

Disassembly of section .text.head:

c0100000 <_text>:
*/
.section .text.head,"ax",@progbits
ENTRY(startup_32)
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
testb $(1<<6), BP_loadflags(%esi)
c0100000: f6 86 11 02 00 00 40 testb $0x40,0x211(%esi)
jnz 2f
c0100007: 75 14 jne c010001d <_text+0x1d>

and this:

c03453a0 :
*/
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
c03453a0: 55 push %ebp
c03453a1: 89 e5 mov %esp,%ebp
c03453a3: 57 push %edi
c03453a4: 56 push %esi
c03453a5: 53 push %ebx
c03453a6: 83 ec 34 sub $0x34,%esp
c03453a9: e8 72 fe db ff call c0105220
c03453ae: 89 45 e4 mov %eax,-0x1c(%ebp)
c03453b1: 89 55 e0 mov %edx,-0x20(%ebp)

static inline unsigned long __raw_local_save_flags(void)
{
unsigned long f;

asm volatile(paravirt_alt(PV_SAVE_REGS
c03453b4: 51 push %ecx
c03453b5: 52 push %edx
c03453b6: ff 15 bc 78 43 c0 call *0xc04378bc
c03453bc: 5a pop %edx
c03453bd: 59 pop %ecx
* should be a rarely used function, only in places where its
* otherwise impossible to know the irq state, like in traps.
*/
static inline void trace_hardirqs_fixup_flags(unsigned long flags)
{
if (raw_irqs_disabled_flags(flags))
c03453be: f6 c4 02 test $0x2,%ah
c03453c1: 0f 85 ea 00 00 00 jne c03454b1
trace_hardirqs_off();
c03453c7: e8 d4 55 e3 ff call c017a9a0
c03453cc: 64 a1 00 70 4c c0 mov %fs:0xc04c7000,%eax
c03453d2: 89 45 e8 mov %eax,-0x18(%ebp)
* We can fault from pretty much anywhere, with unknown IRQ state.
*/
trace_hardirqs_fixup();

tsk = current;
mm = tsk->mm;
c03453d5: 8b 90 b0 00 00 00 mov 0xb0(%eax),%edx
c03453db: 89 d6 mov %edx,%esi
c03453dd: 89 55 ec mov %edx,-0x14(%ebp)
c03453e0: 83 c6 34 add $0x34,%esi
PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
}


This much I have expected. But something even better that I tried after reading the website is the make .lst file actually works!
I did a:

make kernel/panic.lst

and here is part of the panic.lst that I get in return:

panic.o: file format elf32-i386

Disassembly of section .text:

c012c3e0 :
return 1;
}
__setup("panic=", panic_setup);

static long no_blink(long time)
{
c012c3e0: 55 push %ebp
c012c3e1: 89 e5 mov %esp,%ebp
c012c3e3: e8 fc ff ff ff call c012c3e4
c012c3e4: R_386_PC32 mcount
return 0;
}
c012c3e8: 5d pop %ebp
c012c3e9: 31 c0 xor %eax,%eax
c012c3eb: c3 ret
c012c3ec: 8d 74 26 00 lea 0x0(%esi),%esi

Wednesday, October 01, 2008

Floating Point

Last night, out of boredom (because I had a swollen foot), I revised how floating point is represented in PC, using the IEEE Standard 754 format. To convince myself about what was read, I code this short code snippet:

/* This code is licensed under GPLv2 and above */
#include <stdio.h>

int main ()
{
/* Single Precision Float */
/* S Exponent Significant */
/* 0 1000_0000 [1]000_0000_0000_0000_0000_0000 */
/* Note that the Exponent is in Biased format, therefore,
1000_0000 - 0111_1111 (decimal 127) = 0000_0001 (decimal 2)
while the Significant is in Normalized format, which means the 1.bbbb is implied */
/* 0100_0000_0000_0000_0000_0000_0000_0000 */
/* Hex: 0x40000000 */

unsigned long int usi = 0x40000000;
signed long int si = usi;
float f = usi;

unsigned long int *uptr = &usi;
signed long int *sptr = &usi;
float *fptr = &usi;

printf("for 0x40000000\n");
printf("before assignment:\n");
printf("unsigned int = %lu, signed int = %ld, float = %f\n", *uptr, *sptr, *fptr);
printf("after assignment:\n");
printf("unsigned int = %lu, signed int = %ld, float = %f\n", usi, si, f);

return 0;
}

and the output from the execution is:

for 0x40000000
before assignment:
unsigned int = 1073741824, signed int = 1073741824, float = 2.000000
after assignment:
unsigned int = 1073741824, signed int = 1073741824, float = 1073741824.000000
posted by weichong78 at 7:38 AM | 0 comments

Reply via email to