diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.txt new file mode 100644 index 000000000000..64ed63c4f69d --- /dev/null +++ b/Documentation/vm/zsmalloc.txt @@ -0,0 +1,70 @@ +zsmalloc +-------- + +This allocator is designed for use with zram. Thus, the allocator is +supposed to work well under low memory conditions. In particular, it +never attempts higher order page allocation which is very likely to +fail under memory pressure. On the other hand, if we just use single +(0-order) pages, it would suffer from very high fragmentation -- +any object of size PAGE_SIZE/2 or larger would occupy an entire page. +This was one of the major issues with its predecessor (xvmalloc). + +To overcome these issues, zsmalloc allocates a bunch of 0-order pages +and links them together using various 'struct page' fields. These linked +pages act as a single higher-order page i.e. an object can span 0-order +page boundaries. The code refers to these linked pages as a single entity +called zspage. + +For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE +since this satisfies the requirements of all its current users (in the +worst case, page is incompressible and is thus stored "as-is" i.e. in +uncompressed form). For allocation requests larger than this size, failure +is returned (see zs_malloc). + +Additionally, zs_malloc() does not return a dereferenceable pointer. +Instead, it returns an opaque handle (unsigned long) which encodes actual +location of the allocated object. The reason for this indirection is that +zsmalloc does not keep zspages permanently mapped since that would cause +issues on 32-bit systems where the VA region for kernel space mappings +is very small. So, before using the allocating memory, the object has to +be mapped using zs_map_object() to get a usable pointer and subsequently +unmapped using zs_unmap_object(). + +stat +---- + +With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via +/sys/kernel/debug/zsmalloc/. Here is a sample of stat output: + +# cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + .. + .. + 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + .. + .. + + +class: index +size: object size zspage stores +almost_empty: the number of ZS_ALMOST_EMPTY zspages(see below) +almost_full: the number of ZS_ALMOST_FULL zspages(see below) +obj_allocated: the number of objects allocated +obj_used: the number of objects allocated to the user +pages_used: the number of pages allocated for the class +pages_per_zspage: the number of 0-order pages to make a zspage + +We assign a zspage to ZS_ALMOST_EMPTY fullness group when: + n <= N / f, where +n = number of allocated objects +N = total number of objects zspage can store +f = fullness_threshold_frac(ie, 4 at the moment) + +Similarly, we assign zspage to: + ZS_ALMOST_FULL when n > N / f + ZS_EMPTY when n == 0 + ZS_FULL when n == N diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a796407123c7..7dc6540f2b60 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -555,4 +555,6 @@ config BLK_DEV_RBD If unsure, say N. +source "drivers/block/zram/Kconfig" + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 5b795059f8fb..4fd9bdf546a9 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -42,4 +42,6 @@ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ +obj-$(CONFIG_ZRAM) += zram/ + swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 304000c3d433..e8c4361269bc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3632,6 +3632,7 @@ static int mtip_block_initialize(struct driver_data *dd) /* Set device limits. */ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); + clear_bit(QUEUE_FLAG_ADD_RANDOM, &dd->queue->queue_flags); blk_queue_max_segments(dd->queue, MTIP_MAX_SG); blk_queue_physical_block_size(dd->queue, 4096); blk_queue_io_min(dd->queue, 4096); diff --git a/drivers/staging/zram/Kconfig b/drivers/block/zram/Kconfig similarity index 79% rename from drivers/staging/zram/Kconfig rename to drivers/block/zram/Kconfig index a5eacd618a9c..2e57483d4397 100644 --- a/drivers/staging/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -14,7 +14,16 @@ config ZRAM disks and maybe many more. See zram.txt for more information. - Project home: + +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default n + help + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. config ZRAM_LZ4_COMPRESS bool "Enable LZ4 algorithm support" diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile new file mode 100644 index 000000000000..be0763ff57a2 --- /dev/null +++ b/drivers/block/zram/Makefile @@ -0,0 +1,5 @@ +zram-y := zcomp_lzo.o zcomp.o zram_drv.o + +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + +obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c new file mode 100644 index 000000000000..27208ed62df3 --- /dev/null +++ b/drivers/block/zram/zcomp.c @@ -0,0 +1,361 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif + +/* + * single zcomp_strm backend + */ +struct zcomp_strm_single { + struct mutex strm_lock; + struct zcomp_strm *zstrm; +}; + +/* + * multi zcomp_strm backend + */ +struct zcomp_strm_multi { + /* protect strm list */ + spinlock_t strm_lock; + /* max possible number of zstrm streams */ + int max_strm; + /* number of available zstrm streams */ + int avail_strm; + /* list of available strms */ + struct list_head idle_strm; + wait_queue_head_t strm_wait; +}; + +static struct zcomp_backend *backends[] = { + &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif + NULL +}; + +static struct zcomp_backend *find_backend(const char *compress) +{ + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(flags); + /* + * allocate 2 pages. 1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +/* + * get idle zcomp_strm or wait until other process release + * (zcomp_strm_release()) one for us + */ +static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (1) { + spin_lock(&zs->strm_lock); + if (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + spin_unlock(&zs->strm_lock); + return zstrm; + } + /* zstrm streams limit reached, wait for idle stream */ + if (zs->avail_strm >= zs->max_strm) { + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + /* allocate new zstrm stream */ + zs->avail_strm++; + spin_unlock(&zs->strm_lock); + /* + * This function can be called in swapout/fs write path + * so we can't use GFP_FS|IO. And it assumes we already + * have at least one stream in zram initialization so we + * don't do best effort to allocate more stream in here. + * A default stream will work well without further multiple + * streams. That's why we use NORETRY | NOWARN. + */ + zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY | + __GFP_NOWARN); + if (!zstrm) { + spin_lock(&zs->strm_lock); + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + break; + } + return zstrm; +} + +/* add stream back to idle list and wake up waiter or free the stream */ +static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + struct zcomp_strm_multi *zs = comp->stream; + + spin_lock(&zs->strm_lock); + if (zs->avail_strm <= zs->max_strm) { + list_add(&zstrm->list, &zs->idle_strm); + spin_unlock(&zs->strm_lock); + wake_up(&zs->strm_wait); + return; + } + + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + zcomp_strm_free(comp, zstrm); +} + +/* change max_strm limit */ +static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + spin_lock(&zs->strm_lock); + zs->max_strm = num_strm; + /* + * if user has lowered the limit and there are idle streams, + * immediately free as much streams (and memory) as we can. + */ + while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + zs->avail_strm--; + } + spin_unlock(&zs->strm_lock); + return true; +} + +static void zcomp_strm_multi_destroy(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + } + kfree(zs); +} + +static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) +{ + struct zcomp_strm *zstrm; + struct zcomp_strm_multi *zs; + + comp->destroy = zcomp_strm_multi_destroy; + comp->strm_find = zcomp_strm_multi_find; + comp->strm_release = zcomp_strm_multi_release; + comp->set_max_streams = zcomp_strm_multi_set_max_streams; + zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + spin_lock_init(&zs->strm_lock); + INIT_LIST_HEAD(&zs->idle_strm); + init_waitqueue_head(&zs->strm_wait); + zs->max_strm = max_strm; + zs->avail_strm = 1; + + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + if (!zstrm) { + kfree(zs); + return -ENOMEM; + } + list_add(&zstrm->list, &zs->idle_strm); + return 0; +} + +static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_lock(&zs->strm_lock); + return zs->zstrm; +} + +static void zcomp_strm_single_release(struct zcomp *comp, + struct zcomp_strm *zstrm) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_unlock(&zs->strm_lock); +} + +static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +{ + /* zcomp_strm_single support only max_comp_streams == 1 */ + return false; +} + +static void zcomp_strm_single_destroy(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + zcomp_strm_free(comp, zs->zstrm); + kfree(zs); +} + +static int zcomp_strm_single_create(struct zcomp *comp) +{ + struct zcomp_strm_single *zs; + + comp->destroy = zcomp_strm_single_destroy; + comp->strm_find = zcomp_strm_single_find; + comp->strm_release = zcomp_strm_single_release; + comp->set_max_streams = zcomp_strm_single_set_max_streams; + zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + mutex_init(&zs->strm_lock); + zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + if (!zs->zstrm) { + kfree(zs); + return -ENOMEM; + } + return 0; +} + +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); + else + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); + i++; + } + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + return sz; +} + +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) +{ + return comp->set_max_streams(comp, num_strm); +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + return comp->strm_find(comp); +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + comp->strm_release(comp, zstrm); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +void zcomp_destroy(struct zcomp *comp) +{ + comp->destroy(comp); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error. + */ +struct zcomp *zcomp_create(const char *compress, int max_strm) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + + backend = find_backend(compress); + if (!backend) + return ERR_PTR(-EINVAL); + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + + comp->backend = backend; + if (max_strm > 1) + zcomp_strm_multi_create(comp, max_strm); + else + zcomp_strm_single_create(comp); + if (!comp->stream) { + kfree(comp); + return ERR_PTR(-ENOMEM); + } + return comp; +} diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h new file mode 100644 index 000000000000..a3848166747c --- /dev/null +++ b/drivers/block/zram/zcomp.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +#include + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. compression algorithm + * working memory) + */ + void *private; + /* used in multi stream backend, protected by backend strm_lock */ + struct list_head list; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(gfp_t flags); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + void *stream; + struct zcomp_backend *backend; + + struct zcomp_strm *(*strm_find)(struct zcomp *comp); + void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + bool (*set_max_streams)(struct zcomp *comp, int num_strm); + void (*destroy)(struct zcomp *comp); +}; + +ssize_t zcomp_available_show(const char *comp, char *buf); + +struct zcomp *zcomp_create(const char *comp, int max_strm); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); + +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); +#endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000000..0110086accba --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(gfp_t flags) +{ + void *ret; + + ret = kmalloc(LZ4_MEM_COMPRESS, flags); + if (!ret) + ret = __vmalloc(LZ4_MEM_COMPRESS, + flags | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; +} + +static void zcomp_lz4_destroy(void *private) +{ + kvfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000000..60613fb29dd8 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000000..ed7a1f0549ec --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(gfp_t flags) +{ + void *ret; + + ret = kmalloc(LZO1X_MEM_COMPRESS, flags); + if (!ret) + ret = __vmalloc(LZO1X_MEM_COMPRESS, + flags | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; +} + +static void lzo_destroy(void *private) +{ + kvfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000000..128c5807fa14 --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ diff --git a/drivers/staging/zram/zram_drv.c b/drivers/block/zram/zram_drv.c similarity index 67% rename from drivers/staging/zram/zram_drv.c rename to drivers/block/zram/zram_drv.c index a98fb3318ee9..ade1b8bf5acf 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2,6 +2,7 @@ * Compressed RAM block device * * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the licence that better fits your requirements. @@ -9,7 +10,6 @@ * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 * - * Project home: http://compcache.googlecode.com */ #define KMSG_COMPONENT "zram" @@ -29,56 +29,36 @@ #include #include #include -#include -#include #include #include -#include +#include #include "zram_drv.h" -static inline int z_decompress_safe(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len) -{ -#ifdef CONFIG_ZRAM_LZ4_COMPRESS - return lz4_decompress_unknownoutputsize(src, src_len, dest, dest_len); -#else - return lzo1x_decompress_safe(src, src_len, dest, dest_len); -#endif -} - -static inline int z_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem) -{ -#ifdef CONFIG_ZRAM_LZ4_COMPRESS - return lz4_compress(src, src_len, dst, dst_len, wrkmem); -#else - return lzo1x_1_compress(src, src_len, dst, dst_len, wrkmem); -#endif -} - -static inline size_t z_scratch_size(void) -{ -#ifdef CONFIG_ZRAM_LZ4_COMPRESS - return LZ4_MEM_COMPRESS; -#else - return LZO1X_MEM_COMPRESS; -#endif -} - /* Globals */ static int zram_major; static struct zram *zram_devices; - -/* - * We don't need to see memory allocation errors more than once every 1 - * second to know that a problem is occurring. - */ -#define ALLOC_ERROR_LOG_RATE_MS 1000 +static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +#define ZRAM_ATTR_RO(name) \ +static ssize_t zram_attr_##name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static struct device_attribute dev_attr_##name = \ + __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); + +static inline int init_done(struct zram *zram) +{ + return zram->meta != NULL; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -89,94 +69,117 @@ static ssize_t disksize_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", zram->disksize); + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { + u32 val; struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->init_done); -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); + return scnprintf(buf, PAGE_SIZE, "%u\n", val); } -static ssize_t num_writes_show(struct device *dev, +static ssize_t orig_data_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); + return scnprintf(buf, PAGE_SIZE, "%llu\n", + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } -static ssize_t invalid_io_show(struct device *dev, +static ssize_t mem_used_total_show(struct device *dev, struct device_attribute *attr, char *buf) { + u64 val = 0; struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta = zram->meta; - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); + down_read(&zram->init_lock); + if (init_done(zram)) + val = zs_get_total_size_bytes(meta->mem_pool); + up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } -static ssize_t zero_pages_show(struct device *dev, +static ssize_t max_comp_streams_show(struct device *dev, struct device_attribute *attr, char *buf) { + int val; struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->stats.pages_zero); + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { + int num; struct zram *zram = dev_to_zram(dev); + int ret; - return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); -} + ret = kstrtoint(buf, 0, &num); + if (ret < 0) + return ret; + if (num < 1) + return -EINVAL; -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + if (!zcomp_set_max_streams(zram->comp, num)) { + pr_info("Cannot change max compression streams\n"); + ret = -EINVAL; + goto out; + } + } - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); + zram->max_comp_streams = num; + ret = len; +out: + up_write(&zram->init_lock); + return ret; } -static ssize_t mem_used_total_show(struct device *dev, +static ssize_t comp_algorithm_show(struct device *dev, struct device_attribute *attr, char *buf) { - u64 val = 0; + size_t sz; struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta = zram->meta; down_read(&zram->init_lock); - if (zram->init_done) - val = zs_get_total_size_bytes(meta->mem_pool); + sz = zcomp_available_show(zram->compressor, buf); up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", val); + return sz; } +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + up_write(&zram->init_lock); + return len; +} + +/* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { @@ -208,7 +211,8 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) u64 start, end, bound; /* unaligned request */ - if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + if (unlikely(bio->bi_sector & + (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) return 0; if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) return 0; @@ -227,8 +231,6 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) static void zram_meta_free(struct zram_meta *meta) { zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); vfree(meta->table); kfree(meta); } @@ -240,39 +242,24 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) if (!meta) goto out; - meta->compress_workmem = kzalloc(z_scratch_size(), GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto free_workmem; - } - num_pages = disksize >> PAGE_SHIFT; meta->table = vzalloc(num_pages * sizeof(*meta->table)); if (!meta->table) { pr_err("Error allocating zram address table\n"); - goto free_buffer; + goto free_meta; } - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM | - __GFP_NOWARN); + meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto free_table; } + rwlock_init(&meta->tb_lock); return meta; free_table: vfree(meta->table); -free_buffer: - free_pages((unsigned long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); free_meta: kfree(meta); meta = NULL; @@ -317,11 +304,11 @@ static void handle_zero_page(struct bio_vec *bvec) flush_dcache_page(page); } +/* NOTE: caller should hold meta->tb_lock with write-side */ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; unsigned long handle = meta->table[index].handle; - u16 size = meta->table[index].size; if (unlikely(!handle)) { /* @@ -330,21 +317,15 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(meta, index, ZRAM_ZERO)) { zram_clear_flag(meta, index, ZRAM_ZERO); - zram->stats.pages_zero--; + atomic64_dec(&zram->stats.zero_pages); } return; } - if (unlikely(size > max_zpage_size)) - zram->stats.bad_compress--; - zs_free(meta->mem_pool, handle); - if (size <= PAGE_SIZE / 2) - zram->stats.good_compress--; - - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - zram->stats.pages_stored--; + atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; meta->table[index].size = 0; @@ -352,27 +333,32 @@ static void zram_free_page(struct zram *zram, size_t index) static int zram_decompress_page(struct zram *zram, char *mem, u32 index) { - int ret = LZO_E_OK; - size_t clen = PAGE_SIZE; + int ret = 0; unsigned char *cmem; struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; + unsigned long handle; + u16 size; + + read_lock(&meta->tb_lock); + handle = meta->table[index].handle; + size = meta->table[index].size; if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + read_unlock(&meta->tb_lock); clear_page(mem); return 0; } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (meta->table[index].size == PAGE_SIZE) + if (size == PAGE_SIZE) copy_page(mem, cmem); else - ret = z_decompress_safe(cmem, meta->table[index].size, - mem, &clen); + ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); + read_unlock(&meta->tb_lock); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); atomic64_inc(&zram->stats.failed_reads); return ret; @@ -390,11 +376,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; + read_lock(&meta->tb_lock); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { + read_unlock(&meta->tb_lock); handle_zero_page(bvec); return 0; } + read_unlock(&meta->tb_lock); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -412,7 +401,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, ret = zram_decompress_page(zram, uncmem, index); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) + if (unlikely(ret)) goto out_cleanup; if (is_partial_io(bvec)) @@ -437,11 +426,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; - static unsigned long zram_rs_time; + struct zcomp_strm *zstrm; + bool locked = false; page = bvec->bv_page; - src = meta->compress_buffer; - if (is_partial_io(bvec)) { /* * This is a partial IO. We need to read the full page @@ -457,6 +445,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } + zstrm = zcomp_strm_find(zram->comp); + locked = true; user_mem = kmap_atomic(page); if (is_partial_io(bvec)) { @@ -469,52 +459,41 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (page_zero_filled(uncmem)) { - kunmap_atomic(user_mem); + if (user_mem) + kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ + write_lock(&zram->meta->tb_lock); zram_free_page(zram, index); - - zram->stats.pages_zero++; zram_set_flag(meta, index, ZRAM_ZERO); + write_unlock(&zram->meta->tb_lock); + + atomic64_inc(&zram->stats.zero_pages); ret = 0; goto out; } - /* - * zram_slot_free_notify could miss free so that let's - * double check. - */ - if (unlikely(meta->table[index].handle || - zram_test_flag(meta, index, ZRAM_ZERO))) - zram_free_page(zram, index); - - ret = z_compress(uncmem, PAGE_SIZE, src, &clen, - meta->compress_workmem); - + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; uncmem = NULL; } - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); goto out; } - + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { - zram->stats.bad_compress++; clen = PAGE_SIZE; - src = NULL; if (is_partial_io(bvec)) src = uncmem; } handle = zs_malloc(meta->mem_pool, clen); if (!handle) { - if (printk_timed_ratelimit(&zram_rs_time, - ALLOC_ERROR_LOG_RATE_MS)) - pr_info("Error allocating memory for compressed page: %u, size=%zu\n", - index, clen); + pr_info("Error allocating memory for compressed page: %u, size=%zu\n", + index, clen); ret = -ENOMEM; goto out; } @@ -528,82 +507,104 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, memcpy(cmem, src, clen); } + zcomp_strm_release(zram->comp, zstrm); + locked = false; zs_unmap_object(meta->mem_pool, handle); /* * Free memory associated with this sector * before overwriting unused sectors. */ + write_lock(&zram->meta->tb_lock); zram_free_page(zram, index); meta->table[index].handle = handle; meta->table[index].size = clen; + write_unlock(&zram->meta->tb_lock); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_size); - zram->stats.pages_stored++; - if (clen <= PAGE_SIZE / 2) - zram->stats.good_compress++; - + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); out: + if (locked) + zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); - if (ret) atomic64_inc(&zram->stats.failed_writes); return ret; } -static void handle_pending_slot_free(struct zram *zram) -{ - struct zram_slot_free *free_rq; - - spin_lock(&zram->slot_free_lock); - while (zram->slot_free_rq) { - free_rq = zram->slot_free_rq; - zram->slot_free_rq = free_rq->next; - zram_free_page(zram, free_rq->index); - kfree(free_rq); - } - spin_unlock(&zram->slot_free_lock); -} - static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio, int rw) + int offset, struct bio *bio) { int ret; + int rw = bio_data_dir(bio); if (rw == READ) { - down_read(&zram->lock); - handle_pending_slot_free(zram); + atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); - up_read(&zram->lock); } else { - down_write(&zram->lock); - handle_pending_slot_free(zram); + atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); - up_write(&zram->lock); } return ret; } +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_size; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n < offset) + return; + + n -= offset; + index++; + } + + while (n >= PAGE_SIZE) { + /* + * Discard request can be large so the lock hold times could be + * lengthy. So take the lock once per page. + */ + write_lock(&zram->meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&zram->meta->tb_lock); + index++; + n -= PAGE_SIZE; + } +} + static void zram_reset_device(struct zram *zram, bool reset_capacity) { size_t index; struct zram_meta *meta; - flush_work(&zram->free_work); - down_write(&zram->init_lock); - if (!zram->init_done) { + if (!init_done(zram)) { up_write(&zram->init_lock); return; } meta = zram->meta; - zram->init_done = 0; - /* Free all pages that are still in this zram device */ for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { unsigned long handle = meta->table[index].handle; @@ -613,6 +614,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) zs_free(meta->mem_pool, handle); } + zcomp_destroy(zram->comp); + zram->max_comp_streams = 1; + zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -624,37 +628,14 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) up_write(&zram->init_lock); } -static void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - - zram->meta = meta; - zram->init_done = 1; - - pr_debug("Initialization done!\n"); -} - static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u64 disksize; + struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -662,20 +643,37 @@ static ssize_t disksize_store(struct device *dev, disksize = PAGE_ALIGN(disksize); meta = zram_meta_alloc(disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (IS_ERR(comp)) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + down_write(&zram->init_lock); - if (zram->init_done) { - up_write(&zram->init_lock); - zram_meta_free(meta); + if (init_done(zram)) { pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; + err = -EBUSY; + goto out_destroy_comp; } + zram->meta = meta; + zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); up_write(&zram->init_lock); - return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta); + return err; } static ssize_t reset_store(struct device *dev, @@ -689,42 +687,51 @@ static ssize_t reset_store(struct device *dev, zram = dev_to_zram(dev); bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + /* Do not reset an active device! */ - if (bdev->bd_holders) - return -EBUSY; + if (bdev->bd_holders) { + ret = -EBUSY; + goto out; + } ret = kstrtou16(buf, 10, &do_reset); if (ret) - return ret; + goto out; - if (!do_reset) - return -EINVAL; + if (!do_reset) { + ret = -EINVAL; + goto out; + } /* Make sure all pending I/O is finished */ - if (bdev) - fsync_bdev(bdev); + fsync_bdev(bdev); + bdput(bdev); zram_reset_device(zram, true); return len; + +out: + bdput(bdev); + return ret; } -static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) +static void __zram_make_request(struct zram *zram, struct bio *bio) { - int i, offset; + int offset, i; u32 index; struct bio_vec *bvec; - switch (rw) { - case READ: - atomic64_inc(&zram->stats.num_reads); - break; - case WRITE: - atomic64_inc(&zram->stats.num_writes); - break; - } - index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; - offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + offset = (bio->bi_sector & + (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } bio_for_each_segment(bvec, bio, i) { int max_transfer_size = PAGE_SIZE - offset; @@ -740,16 +747,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) bv.bv_len = max_transfer_size; bv.bv_offset = bvec->bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) goto out; bv.bv_len = bvec->bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) goto out; } else - if (zram_bvec_rw(zram, bvec, index, offset, bio, rw) - < 0) + if (zram_bvec_rw(zram, bvec, index, offset, bio) < 0) goto out; update_position(&index, &offset, bvec); @@ -771,7 +777,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) struct zram *zram = queue->queuedata; down_read(&zram->init_lock); - if (unlikely(!zram->init_done)) + if (unlikely(!init_done(zram))) goto error; if (!valid_io_request(zram, bio)) { @@ -779,7 +785,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) goto error; } - __zram_make_request(zram, bio, bio_data_dir(bio)); + __zram_make_request(zram, bio); up_read(&zram->init_lock); return; @@ -789,40 +795,19 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) bio_io_error(bio); } -static void zram_slot_free(struct work_struct *work) -{ - struct zram *zram; - - zram = container_of(work, struct zram, free_work); - down_write(&zram->lock); - handle_pending_slot_free(zram); - up_write(&zram->lock); -} - -static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq) -{ - spin_lock(&zram->slot_free_lock); - free_rq->next = zram->slot_free_rq; - zram->slot_free_rq = free_rq; - spin_unlock(&zram->slot_free_lock); -} - static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; - struct zram_slot_free *free_rq; + struct zram_meta *meta; zram = bdev->bd_disk->private_data; - atomic64_inc(&zram->stats.notify_free); - - free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC); - if (!free_rq) - return; + meta = zram->meta; - free_rq->index = index; - add_slot_free(zram, free_rq); - schedule_work(&zram->free_work); + write_lock(&meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&meta->tb_lock); + atomic64_inc(&zram->stats.notify_free); } static const struct block_device_operations zram_devops = { @@ -834,14 +819,21 @@ static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, disksize_show, disksize_store); static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, + max_comp_streams_show, max_comp_streams_store); +static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, + comp_algorithm_show, comp_algorithm_store); + +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -849,12 +841,16 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_reset.attr, &dev_attr_num_reads.attr, &dev_attr_num_writes.attr, + &dev_attr_failed_reads.attr, + &dev_attr_failed_writes.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, &dev_attr_zero_pages.attr, &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_max_comp_streams.attr, + &dev_attr_comp_algorithm.attr, NULL, }; @@ -866,13 +862,8 @@ static int create_device(struct zram *zram, int device_id) { int ret = -ENOMEM; - init_rwsem(&zram->lock); init_rwsem(&zram->init_lock); - INIT_WORK(&zram->free_work, zram_slot_free); - spin_lock_init(&zram->slot_free_lock); - zram->slot_free_rq = NULL; - zram->queue = blk_alloc_queue(GFP_KERNEL); if (!zram->queue) { pr_err("Error allocating disk queue for device %d\n", @@ -900,7 +891,9 @@ static int create_device(struct zram *zram, int device_id) /* Actual capacity set using syfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); - + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. @@ -910,6 +903,21 @@ static int create_device(struct zram *zram, int device_id) ZRAM_LOGICAL_BLOCK_SIZE); blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + zram->disk->queue->limits.discard_granularity = PAGE_SIZE; + zram->disk->queue->limits.max_discard_sectors = UINT_MAX; + /* + * zram_bio_discard() will clear all logical blocks if logical block + * size is identical with physical block size(PAGE_SIZE). But if it is + * different, we will skip discarding some parts of logical blocks in + * the part of the request range which isn't aligned to physical block + * size. So we can't ensure that all discarded logical blocks are + * zeroed. + */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); add_disk(zram->disk); @@ -919,8 +927,9 @@ static int create_device(struct zram *zram, int device_id) pr_warn("Error creating sysfs group"); goto out_free_disk; } - - zram->init_done = 0; + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram->meta = NULL; + zram->max_comp_streams = 1; return 0; out_free_disk: @@ -937,13 +946,10 @@ static void destroy_device(struct zram *zram) sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, &zram_disk_attr_group); - if (zram->disk) { - del_gendisk(zram->disk); - put_disk(zram->disk); - } + del_gendisk(zram->disk); + put_disk(zram->disk); - if (zram->queue) - blk_cleanup_queue(zram->queue); + blk_cleanup_queue(zram->queue); } static int __init zram_init(void) @@ -1022,4 +1028,3 @@ MODULE_PARM_DESC(num_devices, "Number of zram devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta "); MODULE_DESCRIPTION("Compressed RAM Block Device"); -MODULE_ALIAS("devname:zram"); diff --git a/drivers/staging/zram/zram_drv.h b/drivers/block/zram/zram_drv.h similarity index 67% rename from drivers/staging/zram/zram_drv.h rename to drivers/block/zram/zram_drv.h index 508a19f444fa..7f21c145e317 100644 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -2,6 +2,7 @@ * Compressed RAM block device * * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the licence that better fits your requirements. @@ -9,16 +10,15 @@ * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 * - * Project home: http://compcache.googlecode.com */ #ifndef _ZRAM_DRV_H_ #define _ZRAM_DRV_H_ #include -#include +#include -#include "../zsmalloc/zsmalloc.h" +#include "zcomp.h" /* * Some arbitrary value. This is just to catch @@ -32,7 +32,7 @@ static const unsigned max_num_devices = 32; * Pages that compress to size greater than this are stored * uncompressed in memory. */ -static const size_t max_zpage_size = PAGE_SIZE / 10 * 9; +static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* * NOTE: max_zpage_size must be less than or equal to: @@ -65,52 +65,33 @@ enum zram_pageflags { struct table { unsigned long handle; u16 size; /* object size (excluding header) */ - u8 count; /* object ref count (not yet used) */ u8 flags; } __aligned(4); -/* - * All 64bit fields should only be manipulated by 64bit atomic accessors. - * All modifications to 32bit counter should be protected by zram->lock. - */ struct zram_stats { - atomic64_t compr_size; /* compressed size of pages stored */ + atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ atomic64_t failed_reads; /* should NEVER! happen */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - u32 pages_zero; /* no. of zero filled pages */ - u32 pages_stored; /* no. of pages currently stored */ - u32 good_compress; /* % of pages with compression ratio<=50% */ - u32 bad_compress; /* % of pages with compression ratio>=75% */ + atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t pages_stored; /* no. of pages currently stored */ }; struct zram_meta { - void *compress_workmem; - void *compress_buffer; + rwlock_t tb_lock; /* protect table */ struct table *table; struct zs_pool *mem_pool; }; -struct zram_slot_free { - unsigned long index; - struct zram_slot_free *next; -}; - struct zram { struct zram_meta *meta; - struct rw_semaphore lock; /* protect compression buffers, table, - * 32bit stat counters against concurrent - * notifications, reads and writes */ - - struct work_struct free_work; /* handle pending free request */ - struct zram_slot_free *slot_free_rq; /* list head of free request */ - struct request_queue *queue; struct gendisk *disk; - int init_done; + struct zcomp *comp; + /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* @@ -118,8 +99,8 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - spinlock_t slot_free_lock; - + int max_comp_streams; struct zram_stats stats; + char compressor[10]; }; #endif diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 16f69be820c7..121071f9d34a 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -686,8 +686,10 @@ static void ide_disk_setup(ide_drive_t *drive) printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name, queue_max_sectors(q) / 2); - if (ata_id_is_ssd(id)) + if (ata_id_is_ssd(id)) { queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); + } /* calculate drive capacity, and select LBA if possible */ ide_disk_get_capacity(drive); diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 4edea18b2b0b..da27b23511ab 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -294,6 +294,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, blk_queue_prep_rq(mq->queue, mmc_prep_request); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq->queue); if (mmc_can_erase(card)) mmc_queue_setup_discard(mq->queue, card); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 11c9dce312c2..5cf14d94c698 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -436,6 +436,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) blk_queue_logical_block_size(new->rq, tr->blksize); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, new->rq); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, new->rq); if (tr->discard) { queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, new->rq); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 2b09d599a34d..0dba9eca70dc 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2384,8 +2384,10 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) rot = get_unaligned_be16(&buffer[4]); - if (rot == 1) + if (rot == 1) { queue_flag_set_unlocked(QUEUE_FLAG_NONROT, sdkp->disk->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, sdkp->disk->queue); + } out: kfree(buffer); diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 9ba30899f3b1..fadee64dce08 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -80,12 +80,8 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zram/Kconfig" - source "drivers/staging/zcache/Kconfig" -source "drivers/staging/zsmalloc/Kconfig" - source "drivers/staging/wlags49_h2/Kconfig" source "drivers/staging/wlags49_h25/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index ef575cf8b6a2..38128efe477f 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -32,9 +32,7 @@ obj-$(CONFIG_VT6656) += vt6656/ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ -obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_ZCACHE) += zcache/ -obj-$(CONFIG_ZSMALLOC) += zsmalloc/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ obj-$(CONFIG_FB_SM7XX) += sm7xx/ diff --git a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile deleted file mode 100644 index cb0f9ced6a93..000000000000 --- a/drivers/staging/zram/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zram-y := zram_drv.o - -obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt deleted file mode 100644 index 765d790ae831..000000000000 --- a/drivers/staging/zram/zram.txt +++ /dev/null @@ -1,77 +0,0 @@ -zram: Compressed RAM based block devices ----------------------------------------- - -Project home: http://compcache.googlecode.com/ - -* Introduction - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -* Usage - -Following shows a typical sequence of steps for using zram. - -1) Load Module: - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - (num_devices parameter is optional. Default: 1) - -2) Set Disksize - Set disk size by writing the value to sysfs node 'disksize'. - The value can be either in bytes or you can use mem suffixes. - Examples: - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -3) Activate: - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -4) Stats: - Per-device statistics are exported as various nodes under - /sys/block/zram/ - disksize - num_reads - num_writes - invalid_io - notify_free - discard - zero_pages - orig_data_size - compr_data_size - mem_used_total - -5) Deactivate: - swapoff /dev/zram0 - umount /dev/zram1 - -6) Reset: - Write any positive value to 'reset' sysfs node - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -Please report any problems at: - - Mailing list: linux-mm-cc at laptop dot org - - Issue tracker: http://code.google.com/p/compcache/issues/list - -Nitin Gupta -ngupta@vflare.org diff --git a/drivers/staging/zsmalloc/Kconfig b/drivers/staging/zsmalloc/Kconfig deleted file mode 100644 index 7fab032298f3..000000000000 --- a/drivers/staging/zsmalloc/Kconfig +++ /dev/null @@ -1,10 +0,0 @@ -config ZSMALLOC - bool "Memory allocator for compressed pages" - default n - help - zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. zsmalloc uses virtual memory mapping - in order to reduce fragmentation. However, this results in a - non-standard allocator interface where a handle, not a pointer, is - returned by an alloc(). This handle must be mapped in order to - access the allocated space. diff --git a/drivers/staging/zsmalloc/Makefile b/drivers/staging/zsmalloc/Makefile deleted file mode 100644 index b134848a590d..000000000000 --- a/drivers/staging/zsmalloc/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zsmalloc-y := zsmalloc-main.o - -obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/include/linux/zsmalloc.h similarity index 96% rename from drivers/staging/zsmalloc/zsmalloc.h rename to include/linux/zsmalloc.h index fbe6bec421aa..5e1a3e577595 100644 --- a/drivers/staging/zsmalloc/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. diff --git a/mm/Kconfig b/mm/Kconfig index bbab5a6bc30b..f365946391a4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -401,3 +401,14 @@ config USE_USER_ACCESSIBLE_TIMERS architecture-specific code that will need to be enabled separately. +config ZSMALLOC + tristate "Memory allocator for compressed pages" + depends on MMU + default n + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping + in order to reduce fragmentation. However, this results in a + non-standard allocator interface where a handle, not a pointer, is + returned by an alloc(). This handle must be mapped in order to + access the allocated space. diff --git a/mm/Makefile b/mm/Makefile index ccecbf9818f5..26d16c8c36f0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -50,3 +50,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o +obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/mm/zsmalloc.c similarity index 98% rename from drivers/staging/zsmalloc/zsmalloc-main.c rename to mm/zsmalloc.c index 41a68032ca83..e4f8f4ad14fd 100644 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ b/mm/zsmalloc.c @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. @@ -78,8 +79,7 @@ #include #include #include - -#include "zsmalloc.h" +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -780,21 +780,32 @@ static void zs_exit(void) { int cpu; + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); + __unregister_cpu_notifier(&zs_cpu_nb); + + cpu_notifier_register_done(); } static int zs_init(void) { int cpu, ret; - register_cpu_notifier(&zs_cpu_nb); + cpu_notifier_register_begin(); + + __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) + if (notifier_to_errno(ret)) { + cpu_notifier_register_done(); goto fail; + } } + + cpu_notifier_register_done(); + return 0; fail: zs_exit();