Difference between revisions of "Ubstat system call"
| m (Added more information about arch) | |||
| (9 intermediate revisions by 3 users not shown) | |||
| Line 1: | Line 1: | ||
| This article describes an interesting system call which was designed to pick beancounters statistics. | This article describes an interesting system call which was designed to pick beancounters statistics. | ||
| − | = Overview = | + | == Overview == | 
| The system call appeared in the very first version of the OpenVZ. Its API is rather sloppy, but this is something we have to live with due to backward compatibility reasons. | The system call appeared in the very first version of the OpenVZ. Its API is rather sloppy, but this is something we have to live with due to backward compatibility reasons. | ||
| The main intention of this system call is to allow a user space process get the beancounters statistics ''periodically''. This statistics includes the fields observed in the <code>/proc/user_beancounters</code> file and one more field - the so called <code>minheld</code> value which is opposite to the <code>maxheld</code> one. As long as exporting the statistics the system call also notifies the task about the desired period has elapsed. The notification is performed by sending a signal to a process and this notification is one-shot. | The main intention of this system call is to allow a user space process get the beancounters statistics ''periodically''. This statistics includes the fields observed in the <code>/proc/user_beancounters</code> file and one more field - the so called <code>minheld</code> value which is opposite to the <code>maxheld</code> one. As long as exporting the statistics the system call also notifies the task about the desired period has elapsed. The notification is performed by sending a signal to a process and this notification is one-shot. | ||
| − | = How it works = | + | == How it works == | 
| The typical usage of this call is in performing the following steps. | The typical usage of this call is in performing the following steps. | ||
| # Request the amount of resources | # Request the amount of resources | ||
| Line 16: | Line 16: | ||
| In a signal handler one should just perform a respective system call to get the stats and schedule the next notification (yes, they are performed in one go; see below for more details). | In a signal handler one should just perform a respective system call to get the stats and schedule the next notification (yes, they are performed in one go; see below for more details). | ||
| − | = API = | + | == API == | 
| The system call description is | The system call description is | ||
| − | < | + | <source lang="c"> | 
| long ubstat(int func, unsigned long luid, void *notif, void *buf, int size); | long ubstat(int func, unsigned long luid, void *notif, void *buf, int size); | ||
| − | </ | + | </source> | 
| The macros and data typed used are declared in <code>include/ub/ub_stat.h</code> file. | The macros and data typed used are declared in <code>include/ub/ub_stat.h</code> file. | ||
| − | == Arguments description == | + | === Arguments description === | 
| '''<code>func</code>''' is like <code>cmd</code> in the <code>ioctl</code> system call. It can be one of | '''<code>func</code>''' is like <code>cmd</code> in the <code>ioctl</code> system call. It can be one of | ||
| Line 45: | Line 45: | ||
| '''<code>size</code>''' is the <code>buf</code> memory size. | '''<code>size</code>''' is the <code>buf</code> memory size. | ||
| − | == The statistics format == | + | === The statistics format === | 
| − | The format of data  | + | The format of data returned into the buffer depends on the function requested. | 
| '''1. <code>UBSTAT_READ_ONE</code>''' format is | '''1. <code>UBSTAT_READ_ONE</code>''' format is | ||
| − | < | + | <source lang="c"> | 
|          typedef unsigned long ubstattime_t; |          typedef unsigned long ubstattime_t; | ||
| Line 62: | Line 62: | ||
|                  ubstatparm_t    param[1]; |                  ubstatparm_t    param[1]; | ||
|          } |          } | ||
| − | </ | + | </source> | 
| It contains the time period for which the stats are returned and the <code>maxheld</code> and <code>failcnt</code> for the resource. | It contains the time period for which the stats are returned and the <code>maxheld</code> and <code>failcnt</code> for the resource. | ||
| '''2. <code>UBSTAT_READ_ALL</code>''' format is | '''2. <code>UBSTAT_READ_ALL</code>''' format is | ||
| − | < | + | <source lang="c"> | 
|          typedef unsigned long ubstattime_t; |          typedef unsigned long ubstattime_t; | ||
| Line 80: | Line 80: | ||
|                  ubstatparm_t    param[UB_RESOURCES]; |                  ubstatparm_t    param[UB_RESOURCES]; | ||
|          } |          } | ||
| − | </ | + | </source> | 
| It contains the same info as the <code>UBSTAT_READ_ONE</code> does, but for all the resources. | It contains the same info as the <code>UBSTAT_READ_ONE</code> does, but for all the resources. | ||
| '''3. <code>UBSTAT_READ_FULL</code>''' format is | '''3. <code>UBSTAT_READ_FULL</code>''' format is | ||
| − | < | + | <source lang="c"> | 
|          typedef unsigned long ubstattime_t; |          typedef unsigned long ubstattime_t; | ||
| Line 104: | Line 104: | ||
|                  ubstatparmf_t    param[UB_RESOURCES]; |                  ubstatparmf_t    param[UB_RESOURCES]; | ||
|          } |          } | ||
| − | </ | + | </source> | 
| It contains the extended info for all the resources. | It contains the extended info for all the resources. | ||
| Line 113: | Line 113: | ||
| '''6. UBSTAT_GETTIME''' format is | '''6. UBSTAT_GETTIME''' format is | ||
| − | < | + | <source lang="c"> | 
|          typedef unsigned long ubstattime_t; |          typedef unsigned long ubstattime_t; | ||
| Line 121: | Line 121: | ||
|                  ubstattime_t    cur_time; |                  ubstattime_t    cur_time; | ||
|          }; |          }; | ||
| − | </ | + | </source> | 
| It returns the time interval within which the stats are collected and the current time. | It returns the time interval within which the stats are collected and the current time. | ||
| Line 127: | Line 127: | ||
| All the times used are in seconds. | All the times used are in seconds. | ||
| − | == Notification == | + | === Notification === | 
| The notification info is passed via the <code>notif</code> argument and is being set up for all the functions except the <code>UBLIST</code> and the <code>UBPARNUM</code>. The notification is one-shot, but note that once you requested the statistics the next shot is scheduled at the same time. | The notification info is passed via the <code>notif</code> argument and is being set up for all the functions except the <code>UBLIST</code> and the <code>UBPARNUM</code>. The notification is one-shot, but note that once you requested the statistics the next shot is scheduled at the same time. | ||
| The <code>notif</code> should point to | The <code>notif</code> should point to | ||
| − | < | + | <source lang="c"> | 
|          typedef struct { |          typedef struct { | ||
|                  long            maxinterval; |                  long            maxinterval; | ||
|                  int             signum; |                  int             signum; | ||
|          } ubnotifrq_t; |          } ubnotifrq_t; | ||
| − | </ | + | </source> | 
| The <code>maxinterval</code> is the time after which the notification will be delivered. It should be more than 1 (second). | The <code>maxinterval</code> is the time after which the notification will be delivered. It should be more than 1 (second). | ||
| Line 142: | Line 142: | ||
| The <code>signum</code> is the signal that will be sent to notify. | The <code>signum</code> is the signal that will be sent to notify. | ||
| − | = Demo = | + | === Return value === | 
| + | The system call returns -1 in case error has occurred. In case of <code>UBSTAT_UBPARMNUM</code> it returns <code>UB_RESOURCES</code> and in all other cases it returns the amount of bytes written to the <code>buf</code>. | ||
| + | |||
| + | == Demo == | ||
| The following program demonstrates how you can (but not should) use the described API. This example is deliberately made very stupid and simple to demonstrate the main idea and will only work on x86_64. | The following program demonstrates how you can (but not should) use the described API. This example is deliberately made very stupid and simple to demonstrate the main idea and will only work on x86_64. | ||
| − | < | + | <source lang="c"> | 
| #include <stdio.h> | #include <stdio.h> | ||
| #include <unistd.h> | #include <unistd.h> | ||
| Line 282: | Line 285: | ||
| 	return 0; | 	return 0; | ||
| } | } | ||
| − | </ | + | </source> | 
| − | = Implementation constraints = | + | == Implementation constraints == | 
| Unfortunately the API is not architecture independent and thus 32-bit application will simply not work on x86_64. | Unfortunately the API is not architecture independent and thus 32-bit application will simply not work on x86_64. | ||
| [[Category:UBC]] | [[Category:UBC]] | ||
Latest revision as of 14:06, 20 January 2011
This article describes an interesting system call which was designed to pick beancounters statistics.
Contents
Overview[edit]
The system call appeared in the very first version of the OpenVZ. Its API is rather sloppy, but this is something we have to live with due to backward compatibility reasons.
The main intention of this system call is to allow a user-space process to get the beancounter statistics periodically. These statistics include the fields observed in the /proc/user_beancounters file and one more field - the so-called minheld value, which is the opposite of the maxheld one. Along with exporting the statistics, the system call also notifies the task that the desired period has elapsed. The notification is performed by sending a signal to the process, and this notification is one-shot.
How it works[edit]
The typical usage of this call is in performing the following steps.
- Request the amount of resources
- Get the IDs of all the living beancounters
- Setup a handler for some signal (e.g. USR1)
- Perform a system call to setup the notification
- Go do something (or sleep for ever)
In a signal handler one should just perform a respective system call to get the stats and schedule the next notification (yes, they are performed in one go; see below for more details).
API[edit]
The system call description is
long ubstat(int func, unsigned long luid, void *notif, void *buf, int size);
The macros and data types used are declared in the include/ub/ub_stat.h file.
Arguments description[edit]
func is like cmd in the ioctl system call. It can be one of
- UBSTAT_READ_ONE to read basic stats for one resource. The desired resource itself should be or-ed with the func
- UBSTAT_READ_ALL to read basic stats about all the resources
- UBSTAT_READ_FULL to read extended stats about all the resources
- UBSTAT_UBLIST to get the ids of the beancounters
- UBSTAT_UBPARMNUM to get the number of resources used by the kernel
- UBSTAT_GETTIME to get the time interval within which the stats are collected and the current time
See below for what basic and extended stats mean.
luid is the desired beancounter ID. Only one beancounter can be checked at one call.
notif is the pointer to a ubnotifrq_t structure which describes the notification details (see below).
buf is the pointer to a chunk of memory, which will contain the data requested.
size is the buf memory size.
The statistics format[edit]
The format of data returned into the buffer depends on the function requested.
1. UBSTAT_READ_ONE format is
        typedef unsigned long ubstattime_t;
        typedef struct {
                unsigned long   maxheld;
                unsigned long   failcnt;
        } ubstatparm_t;
        struct {
                ubstattime_t    start_time;
                ubstattime_t    end_time;
                ubstatparm_t    param[1];
        }
It contains the time period for which the stats are returned and the maxheld and failcnt for the resource.
2. UBSTAT_READ_ALL format is
        typedef unsigned long ubstattime_t;
        typedef struct {
                unsigned long   maxheld;
                unsigned long   failcnt;
        } ubstatparm_t;
        struct {
                ubstattime_t    start_time;
                ubstattime_t    end_time;
                ubstatparm_t    param[UB_RESOURCES];
        }
It contains the same info as the UBSTAT_READ_ONE does, but for all the resources.
3. UBSTAT_READ_FULL format is
        typedef unsigned long ubstattime_t;
        typedef struct {
                unsigned long   barrier;
                unsigned long   limit;
                unsigned long   held;
                unsigned long   maxheld;
                unsigned long   minheld;
                unsigned long   failcnt;
                unsigned long __unused1;
                unsigned long __unused2;
       } ubstatparmf_t;
        struct {
                ubstattime_t    start_time;
                ubstattime_t    end_time;
                ubstatparmf_t    param[UB_RESOURCES];
        }
It contains the extended info for all the resources.
4. UBSTAT_UBLIST treats the buf as a pointer to an array of unsigned long.
5. UBSTAT_UBPARMNUM ignores the buf.
6. UBSTAT_GETTIME format is
        typedef unsigned long ubstattime_t;
        struct {
                ubstattime_t    start_time;
                ubstattime_t    end_time;
                ubstattime_t    cur_time;
        };
It returns the time interval within which the stats are collected and the current time.
All the times used are in seconds.
Notification[edit]
The notification info is passed via the notif argument and is set up for all the functions except UBSTAT_UBLIST and UBSTAT_UBPARMNUM. The notification is one-shot, but note that once you request the statistics, the next shot is scheduled at the same time.
The notif should point to
        typedef struct {
                long            maxinterval;
                int             signum;
        } ubnotifrq_t;
The maxinterval is the time after which the notification will be delivered. It should be more than 1 (second).
The signum is the signal that will be sent to notify.
Return value[edit]
The system call returns -1 if an error has occurred. For UBSTAT_UBPARMNUM it returns UB_RESOURCES, and in all other cases it returns the number of bytes written to the buf.
Demo[edit]
The following program demonstrates how you can (but should not) use the described API. This example is deliberately made very stupid and simple to demonstrate the main idea, and it will only work on x86_64.
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <ub_stat.h>
#include <signal.h>
/* Size in bytes of the reply buffer handed to the system call. */
#define UBSTAT_BUFSIZE	4096
/* Signal requested for the period-elapsed notification. */
#define UBSTAT_NOTIFSIG	SIGUSR1
/*
 * Number of resource slots the print helpers walk; hard-coded for the demo.
 * NOTE(review): presumably must match the value UBSTAT_UBPARMNUM reports
 * on the running kernel -- confirm before relying on it.
 */
#define UB_RESOURCES	24
/* Reply buffer filled by the system call and decoded by print_stat. */
static char ubstat_buf[UBSTAT_BUFSIZE];
/* Beancounter id and function code reused on every call from do_notify. */
static int luid, func;
/* Notification request (interval + signal) passed on every call. */
static ubnotifrq_t notif;
/* Decoder for ubstat_buf, selected by res2func to match the function code. */
static void (*print_stat)(void *buf);
/*
 * Invokes syscall number 503 with the given arguments.
 * NOTE(review): the number is hard-coded; the accompanying article states
 * the example only works on x86_64.
 */
#define sys_ubstat(args...)	syscall(503, ## args)
/* Print the command-line synopsis and the meaning of <resource> to stdout. */
static void usage(void)
{
	static const char *const help[] = {
		"ubstat <id> <period> <resource>\n",
		"resource can be either num or:\n",
		"-1 for all maxheld and failcnt\n",
		"-2 for all at all\n",
	};
	size_t row;

	for (row = 0; row < sizeof(help) / sizeof(help[0]); row++)
		fputs(help[row], stdout);
}
/*
 * Decode a UBSTAT_READ_ONE reply: a start/end time pair followed by a
 * single parameter slot. Prints that slot's maxheld and failcnt.
 */
static void print_stat_one(void *buf)
{
	const struct {
		ubstattime_t    start_time;
		ubstattime_t    end_time;
		ubstatparm_t    param[1];
	} *reply = buf;
	const ubstatparm_t *slot = &reply->param[0];

	printf("maxheld: %lu\n", slot->maxheld);
	printf("failcnt: %lu\n", slot->failcnt);
}
static void print_stat_all(void *buf)
{
	struct {
		ubstattime_t    start_time;
		ubstattime_t    end_time;
		ubstatparm_t    param[UB_RESOURCES];
	} *data;
	int res;
	data = buf;
	for (res = 0; res < UB_RESOURCES; res++) {
		printf("res %d\n", res);
		printf("\tmaxheld: %lu\n", data->param[res].maxheld);
		printf("\tfailcnt: %lu\n", data->param[res].failcnt);
	}
}
static void print_stat_full(void *buf)
{
	struct {
		ubstattime_t    start_time;
		ubstattime_t    end_time;
		ubstatparmf_t    param[UB_RESOURCES];
	} *data;
	int res;
	data = buf;
	for (res = 0; res < UB_RESOURCES; res++) {
		printf("res %d\n", res);
		printf("minheld: %lu\n", data->param[res].minheld);
		printf("maxheld: %lu\n", data->param[res].maxheld);
		printf("failcnt: %lu\n", data->param[res].failcnt);
	}
}
/*
 * Translate the command-line <resource> value into a ubstat function code
 * and select the matching reply decoder in print_stat:
 *   -1   -> UBSTAT_READ_ALL,  decoded by print_stat_all
 *   -2   -> UBSTAT_READ_FULL, decoded by print_stat_full
 *   >= 0 -> UBSTAT_READ_ONE or-ed with the resource, decoded by print_stat_one
 * Any other negative value prints a diagnostic and exits with status 1.
 */
static inline int res2func(int resource)
{
	switch (resource) {
	case -1:
		print_stat = print_stat_all;
		return UBSTAT_READ_ALL;
	case -2:
		print_stat = print_stat_full;
		return UBSTAT_READ_FULL;
	default:
		break;
	}

	if (resource < 0) {
		printf("Bad resource %d\n", resource);
		exit(1);
	}

	print_stat = print_stat_one;
	return UBSTAT_READ_ONE | resource;
}
static void do_notify(int x)
{
	int err;
	err = sys_ubstat(func, luid, (unsigned long)¬if,
			ubstat_buf, UBSTAT_BUFSIZE);
	if (err < 0) {
		perror("Can't set stat");
		exit(0);
	}
	print_stat(ubstat_buf);
}
/*
 * Store the request parameters in the file-scope state, install do_notify
 * as the handler for UBSTAT_NOTIFSIG, and perform the first call by
 * invoking do_notify directly.
 *
 * id       - beancounter id, stored in luid
 * period   - notification interval, stored in notif.maxinterval
 * resource - resource selector, translated by res2func (which exits on a
 *            bad value)
 *
 * Returns 0. Failures are not reported through the return value: both
 * res2func and do_notify terminate the process on error.
 */
static int do_ubstat(int id, int period, int resource)
{
	luid = id;
	func = res2func(resource);

	notif.maxinterval = period;
	notif.signum = UBSTAT_NOTIFSIG;

	signal(UBSTAT_NOTIFSIG, do_notify);
	do_notify(0);

	return 0;
}
/*
 * Entry point: ubstat <id> <period> <resource>.
 *
 * With no arguments the usage text is printed and 0 is returned. With
 * fewer than three arguments the usage text is printed and 1 is returned,
 * so that argv[2] and argv[3] are never read when absent. Otherwise the
 * three arguments are converted with atoi, the first request is issued via
 * do_ubstat, and the process sleeps forever while the signal handler does
 * the work.
 */
int main(int argc, char **argv)
{
	int id, period, res;

	if (argc < 4) {
		usage();
		/* A bare invocation is a help request; partial arguments are an error. */
		return argc == 1 ? 0 : 1;
	}

	id = atoi(argv[1]);
	period = atoi(argv[2]);
	res = atoi(argv[3]);

	do_ubstat(id, period, res);

	while (1)
		sleep(10);

	return 0;
}
Implementation constraints[edit]
Unfortunately the API is not architecture independent, and thus a 32-bit application will simply not work on x86_64.
