--- /sys/src/9k/k10/asm.c +++ /sys/src/9k/k10/asm.c @@ -282,14 +282,54 @@ asmwalkalloc(usize size) static int npg[4]; +/* + * return space needed for the Page array, plus the top of kernel + * allocation space below 4GB (via *ksizep). pageinit has not run yet. + */ +static uintmem +pagesarraysize(Asm *asmlist, uintmem *ksizep) +{ + Asm *asm; + uintmem np, ksize; + + np = ksize = 0; + for(asm = asmlist; asm != nil; asm = asm->next){ + if(asm->limit == asm->base || asm->type != AsmMEMORY) + continue; + /* + * kernel memory is contiguous from 1 MB to the first gap; + * there can be unmapped gaps (e.g. ACPI) from the e820 map. + */ + if (ksize == 0 && asm->base >= MB && asm->base < 4ull*GB && + asm->limit <= 4ull*GB) + ksize = asm->limit; + np += (asm->limit - asm->base)/PGSZ; + } + *ksizep = ksize; + DBG("kernel stops at %#P\n", ksize); + + /* + * leave room in the kernel for things other than Pages. + * if the machine has > ~8GB, put Pages above 4GB rather than + * reducing np. + */ + if (ksize && np*sizeof(Page) > ksize*3/4 && sys->pmend < 7ull*GB) { + ksize = (ksize/PGSZ)*3/4; + print("reduced kernel pages %llud to %llud to fit bottom 4GB\n", + (uvlong)np, (uvlong)ksize); + np = ksize; + } + return PGROUND(np * sizeof(Page)); +} + void asmmeminit(void) { Asm* asm; PTE *pte; int i, j, l; - uintptr n, va; - uintmem hi, lo, mem, nextmem, pa; + uintptr va; + uintmem hi, lo, mem, nextmem, pa, ksize; Pallocmem *pm; /* @@ -370,10 +410,7 @@ asmmeminit(void) asm->kbase = PTR2UINT(KADDR(asm->base)); } - n = sys->vmend - sys->vmstart; /* close enough */ - if(n > 600*MiB) - n = 600*MiB; - ialloclimit(n/2); + ialloclimit((sys->vmend - sys->vmstart)/2); /* close enough */ pm = palloc.mem; j = 0; @@ -393,4 +430,9 @@ asmmeminit(void) j++; pm++; } + + /* allocate the Page array from a high memory bank or, failing that, malloc */ + pgmem = pagesarraysize(asmlist, &ksize); + mallocinit(); + allocpages(ksize, pgmem, Mustmalloc); } --- /sys/src/9k/k10/main.c +++ /sys/src/9k/k10/main.c @@ -210,9 +210,8 @@ main(u32int ax, u32int bx) ioinit(); kbdinit(); - meminit(); + meminit(); /* now also calls mallocinit + allocpages */ archinit(); - mallocinit(); trapinit(); /* --- /sys/src/9k/k10/mem.h +++ /sys/src/9k/k10/mem.h @@ -8,6 +8,14 @@ #define PiB 1125899906842624ll /* Pebi 0x0004000000000000 */ #define EiB 1152921504606846976ll /* Exbi 0x1000000000000000 */ +/* aliases for the imported (Geoff) page allocator, which uses KB/MB/... */ +#define KB KiB +#define MB MiB +#define GB GiB +#define TB TiB +#define PB PiB +#define EB EiB + #define HOWMANY(x, y) (((x)+((y)-1))/(y)) #define ROUNDUP(x, y) (HOWMANY((x), (y))*(y)) #define ROUNDDN(x, y) (((x)/(y))*(y)) @@ -24,6 +32,7 @@ #define PGSZ (4*KiB) /* page size */ #define PGSHFT 12 /* log(PGSZ) */ +#define PGROUND(s) ROUNDUP((s), PGSZ) /* for the imported page allocator */ #define PTSZ (4*KiB) /* page table page size */ #define PTSHFT 9 /* */ --- /sys/src/9k/port/devcons.c +++ /sys/src/9k/port/devcons.c @@ -997,12 +997,12 @@ consread(Chan *c, void *buf, long n, vlong off) "%llud memory\n" "%d pagesize\n" "%llud kernel\n" - "%lud/%lud user\n" + "%llud/%llud user\n" "0/0 swap\n", /* keep old 9 scripts happy */ sys->pmoccupied, PGSZ, ROUNDUP(sys->vmend - KTZERO, PGSZ)/PGSZ, - palloc.user-palloc.freecount, palloc.user); + (uvlong)(palloc.user-palloc.freecount), (uvlong)palloc.user); b = buf; i = readstr(offset, b, n, tmp); b += i; --- /sys/src/9k/port/page.c +++ /sys/src/9k/port/page.c @@ -4,11 +4,26 @@ #include "dat.h" #include "fns.h" -#define pghash(daddr) palloc.hash[(daddr>>PGSHFT)&(PGHSIZE-1)] +// #include "dbgprint.h" + +#define pghash(daddr) palloc.hash[((daddr)>>PGSHFT) & (PGHSIZE-1)] + +enum { + Maxloops = 0, /* if > 0, max loops in newpage() */ +}; struct Palloc palloc; -static uint highwater; /* TO DO */ +static uintptr highwater; + +uchar pgcarved; /* flag: pgmem carved off a memory bank? visible to memory.c */ +uintptr pgmem; /* bytes allocated for the palloc.pages array */ + +static void +zero(void *p, uintptr n) +{ + memset(p, 0, n); +} /* * Split palloc.mem[i] if it's not all of the same color and we can. @@ -20,21 +35,21 @@ splitbank(int i, int e) Pallocmem *pm; uintmem psz; - if(e == nelem(palloc.mem)) + if(e >= nelem(palloc.mem)) /* trying to split last slot? */ return 0; pm = &palloc.mem[i]; - pm->color = memcolor(pm->base, &psz); + psz = 0; + pm->color = memcolor(pm->base, &psz); if(pm->color < 0){ - pm->color = 0; - if(i > 0) - pm->color = pm[-1].color; + pm->color = i > 0? pm[-1].color: 0; return 0; } if(psz <= PGSZ || psz >= (pm->limit - pm->base)) return 0; - if(i+1 < e) + if(i+1 < e) /* slot available in palloc.mem? */ memmove(pm+2, pm+1, (e-i-1)*sizeof(Pallocmem)); + /* else just step on the next slot */ pm[1].base = pm->base + psz; pm[1].limit = pm->limit; pm->limit = pm[1].base; @@ -44,18 +59,181 @@ splitbank(int i, int e) return 1; } -void -pageinit(void) +/* + * add the pages of each bank to the palloc.head list of available + * pages, built from the Page array. to allow addresses as large as 2⁶³ + * or even just 2²⁰, page numbers need to be uvlong or uintptr. + */ +static void +banksavailpages(int e) { - int e, i, j; + int i; + uintptr pgno, np; Page *p; Pallocmem *pm; - uintmem np, pkb, kkb, kmkb, mkb; - for(e = 0; e < nelem(palloc.mem); e++){ + palloc.head = p = palloc.pages; + if (p == nil) + panic("banksavailpages: nil palloc.pages"); + for(i=0; ilimit - pm->base)/PGSZ; + for(pgno=0; pgno < np; pgno++){ + p->prev = p-1; + p->next = p+1; + /* + * it's important that pgno be uvlong, so that pgno*PGSZ + * doesn't otherwise overflow a ulong for pgno ≥ 1M. + */ + p->pa = pm->base + pgno*PGSZ; + p->lg2size = PGSHFT; + p->color = pm->color; + palloc.freecount++; + p++; + } + } + palloc.tail = p - 1; + palloc.head->prev = 0; + palloc.tail->next = 0; + + palloc.user = p - palloc.pages; /* # of user pages */ + /* there are no more free pages than user pages */ + if (palloc.freecount > palloc.user) + palloc.freecount = palloc.user; +} + +typedef struct Unit Unit; +struct Unit { + uvlong scale; + char sfx[2]; +}; + +static Unit units[] = { + EB, "E", PB, "P", TB, "T", GB, "G", MB, "M", KB, "K", 1, "", +}; + +static void +prsize(char *buf, int len, uvlong size) +{ + uvlong scale; + Unit *unit; + + for (unit = units; unit < units + nelem(units) - 1; unit++) + if (size == unit->scale || size >= 10*unit->scale) + break; + /* round scaled size */ + scale = unit->scale; + snprint(buf, len, "%llud%s", (size + scale/2) / scale, unit->sfx); +} + +int +unitsconv(Fmt *f) +{ + char buf[32]; + + switch(f->r) { + case 'N': + prsize(buf, sizeof buf, va_arg(f->args, uvlong)); + return fmtstrcpy(f, buf); + default: + return fmtstrcpy(f, "(unitsconv)"); + } +} + +static void +pagesummary(void) +{ + /* user, kernel, kernel malloc area, total memory in bytes */ + uintptr uby, kby, kmby, occby; + uintptr usedby; + + fmtinstall('N', unitsconv); + kby = (uintptr)end - KTZERO; + kmby = sys->vmend - (uintptr)end; + /* + * Page array could be in the malloc arena or in KSEG2, but count it + * as part of the malloc arena. + * pgmem space for palloc.pages was allocated in meminit, but may + * be a little larger than needed. + */ + if (pgcarved) { + pgmem = PGROUND(palloc.user * sizeof(Page)); + kmby += pgmem; + } + uby = palloc.user * PGSZ; + usedby = kby + kmby + uby; + occby = sys->pmoccupied; /* amount, not address */ + if (0 && usedby > occby) + print("memory used > occupied by %N\n", usedby - occby); + + print("%N memory: %N+%N kernel", occby, kby, kmby); + if (pgcarved) + print(" (%N of Page structs)", pgmem); + print(", %N user", uby); + if (occby > usedby) + print(", %N unused", occby - usedby); + print("\n"); +} + +/* + * palloc.mem[] banks *could* overlap what we allocate for .pages, + * so run through banks, looking for one large enough to hold pgmem + * and above the kernel's allocation space. + * ksize is the top of the kernel's allocation space. + * if not found, just malloc (in KSEG0). if found, tear off beginning of bank. + * all pages have been mapped in meminit. + * + * we call this from meminit(), before palloc regions are sub-allocated. + * pgmem may be a little higher than necessary, which is harmless. + */ +void +allocpages(uintmem ksize, uintmem pgmem, int forcemalloc) +{ + int i, e; + Pallocmem *pm; + + for(e = 0; e < nelem(palloc.mem); e++) if(palloc.mem[e].base == palloc.mem[e].limit) break; + pm = nil; + /* work backward to avoid first bank, if possible */ + for(i = e - 1; i >= 0; i--){ + pm = &palloc.mem[i]; + /* enough space for Page array in this bank? */ + if (pm->base >= ksize && pm->limit - pm->base > pgmem) + break; + } + assert(palloc.pages == nil); + /* no bank big enough, or caller insists on malloc? */ + if (i >= e || pm == nil || forcemalloc == Mustmalloc) + palloc.pages = malloc(pgmem); /* zeroes its allocation */ + else { + /* palloc.mem[i] has room for Pages, which may be GBs */ + palloc.pages = KADDR(pm->base); + pm->base += pgmem; + /* zeroing is a good idea & needed since Page contains a Lock */ + zero(palloc.pages, pgmem); + pgcarved = 1; + DBG("Pages carved from start of bank %d\n", i); } + if(palloc.pages == nil) + panic("pageinit: no memory for user-page descriptors"); + /* TODO: revert iprint to DBG */ + iprint("Pages array of %,llud bytes allocated at %#p to %#p\n", + pgmem, palloc.pages, (char *)palloc.pages + pgmem - 1); +} + +/* pagesarraysize has already been called. */ +void +pageinit(void) +{ + int e, i; + Pallocmem *pm; + uintmem np; + + for(e = 0; e < nelem(palloc.mem); e++) + if(palloc.mem[e].base == palloc.mem[e].limit) + break; /* * Split banks if not of the same color @@ -67,61 +245,22 @@ pageinit(void) if(splitbank(i, e)) e++; - print("palloc[%d] col %d %#P %#P\n", + DBG("palloc[%d] col %d %#P %#P\n", i, pm->color, pm->base, pm->limit); - -/* BUG; can't handle it all right now */ -if(pm->base > 600*MiB){ - pm->limit = pm->base; - continue; -} -if(pm->limit > 600*MiB) - pm->limit = 600*MiB; - np += (pm->limit - pm->base)/PGSZ; } - palloc.pages = malloc(np*sizeof(Page)); - if(palloc.pages == nil) - panic("pageinit"); - - palloc.head = palloc.pages; - p = palloc.head; - for(i=0; ilimit - pm->base)/PGSZ; - for(j=0; jprev = p-1; - p->next = p+1; - p->pa = pm->base+j*PGSZ; - p->lg2size = PGSHFT; - p->color = pm->color; - palloc.freecount++; - p++; - } - } - palloc.tail = p - 1; - palloc.head->prev = nil; - palloc.tail->next = nil; - - palloc.user = p - palloc.pages; + banksavailpages(e); /* generate palloc.pages list, sets palloc.user */ - /* user, kernel, kernel malloc area, memory */ - pkb = palloc.user*PGSZ/KiB; - kkb = ROUNDUP((uintptr)end - KTZERO, PGSZ)/KiB; - kmkb = ROUNDUP(sys->vmend - (uintptr)end, PGSZ)/KiB; - mkb = sys->pmoccupied/KiB; + /* Paging numbers. keep at least 5% of user pages free. */ + highwater = (palloc.user*5ULL)/100; + if(highwater >= 64*MB/PGSZ) /* or at least 64MB, if 5% is more. */ + highwater = 64*MB/PGSZ; - /* Paging numbers */ - highwater = (palloc.user*5)/100; - if(highwater >= 64*MiB/PGSZ) - highwater = 64*MiB/PGSZ; - - print("%lldM memory: %lldK+%lldM kernel, %lldM user, %lldM lost\n", - mkb/KiB, kkb, kmkb/KiB, pkb/KiB, (mkb-kkb-kmkb-pkb)/KiB); + pagesummary(); } -static void +void pageunchain(Page *p) { if(canlock(&palloc)) @@ -149,10 +288,10 @@ pagechaintail(Page *p) } else { palloc.head = p; - p->prev = nil; + p->prev = 0; } palloc.tail = p; - p->next = nil; + p->next = 0; palloc.freecount++; } @@ -167,13 +306,36 @@ pagechainhead(Page *p) } else { palloc.tail = p; - p->next = nil; + p->next = 0; } palloc.head = p; - p->prev = nil; + p->prev = 0; palloc.freecount++; } +static void +pagewait(uintptr va, int color) +{ + qlock(&palloc.pwait); /* Hold memory requesters here */ + + while(waserror()) /* Ignore interrupts */ + ; + + if(palloc.freecount > 0) + iprint("nearly "); + iprint("out of physical memory for va %#p in %s; " + "highwater %llud freecount %llud\n", + va, (up? up->text: ""), + (uvlong)highwater, (uvlong)palloc.freecount); + USED(color); + pagereclaim(highwater/2); + + tsleep(&palloc.r, ispages, 0, 1000); + poperror(); + + qunlock(&palloc.pwait); +} + /* * allocate and return a new page for the given virtual address in segment s; * return nil iff s was locked on entry and had to be unlocked to wait for memory. @@ -182,60 +344,53 @@ Page* newpage(int clear, Segment *s, uintptr va, int locked) { Page *p; - KMap *k; uchar ct; - int i, hw, dontalloc, color; + int color, loops; + uintptr hw; + /* if free memory is tight (uncommon), wait for some to be freed. */ lock(&palloc); color = getpgcolor(va); hw = highwater; - for(;;) { - if(palloc.freecount > hw) - break; - if(up->kp && palloc.freecount > 0) - break; - + loops = 0; + /* assumes that up will be set by the time freecount <= hw */ + while (palloc.freecount <= hw && (!up->kp || palloc.freecount <= 0)) { unlock(&palloc); - dontalloc = 0; + if(locked){ qunlock(&s->lk); - locked = 0; - dontalloc = 1; - } - qlock(&palloc.pwait); /* Hold memory requesters here */ - - while(waserror()) /* Ignore interrupts */ - ; - - print("out of physical memory\n"); - pagereclaim(highwater/2); - - tsleep(&palloc.r, ispages, 0, 1000); - - poperror(); - - qunlock(&palloc.pwait); - - /* - * If called from fault and we lost the segment from - * underneath don't waste time allocating and freeing - * a page. Fault will call newpage again when it has - * reacquired the segment locks - */ - if(dontalloc) + pagewait(va, color); + /* + * If called from fault and we lost the segment from + * underneath don't waste time allocating and freeing + * a page. Fault will call newpage again when it has + * reacquired the segment locks. + */ return nil; + } + pagewait(va, color); + if (Maxloops && ++loops > Maxloops) + panic("newpage: va %#p: no page after waiting %d iterations", + va, Maxloops); lock(&palloc); } + USED(loops); /* First try for our colour */ + /* + * if we actually ever have systems with multiple memory domains, + * we'll probably want to maintain separate lists for each color + * so that we don't spend a long time skipping pages of the wrong color. + */ for(p = palloc.head; p; p = p->next) if(p->color == color) break; ct = PG_NOFLUSH; - if(p == nil) { + if(p == 0) { p = palloc.head; + assert(p); /* palloc.head */ p->color = color; ct = PG_NEWCOL; } @@ -244,19 +399,22 @@ newpage(int clear, Segment *s, uintptr va, int locked) lock(p); if(p->ref != 0) - panic("newpage: p->ref %d != 0", p->ref); + panic("newpage"); uncachepage(p); p->ref++; p->va = va; p->modref = 0; - mmucachectl(p, ct); + memset(p->cachectl, ct, sizeof(p->cachectl)); unlock(p); unlock(&palloc); if(clear) { + KMap *k; + k = kmap(p); - memset((void*)VA(k), 0, pagesize(p)); + assert(p->lg2size < 32); /* else need wide memset */ + zero((void*)VA(k), 1LL<lg2size); kunmap(k); } @@ -272,24 +430,29 @@ ispages(void*) void putpage(Page *p) { + /* + * we infrequently get lock loops on palloc here. they pass, so + * they probably just indicate a long search down a linked list of + * Pages (such as palloc.head or palloc.tail) somewhere. + */ lock(&palloc); lock(p); - if(p->ref == 0) - panic("putpage"); - if(--p->ref > 0) { unlock(p); unlock(&palloc); return; } + if(p->ref < 0) + panic("putpage"); + if(p->image != nil) pagechaintail(p); else pagechainhead(p); - if(palloc.r.p != nil) + if(palloc.r.p != 0) wakeup(&palloc.r); unlock(p); @@ -306,7 +469,7 @@ auxpage(void) if(palloc.freecount <= highwater) { /* memory's tight, don't use it for file cache */ unlock(&palloc); - return nil; + return 0; } pageunchain(p); @@ -340,7 +503,7 @@ retry: print("duppage %d, up %#p\n", retries, up); dupretries += 100; if(dupretries > 100000) - panic("duppage\n"); + panic("duppage"); uncachepage(p); return 1; } @@ -369,6 +532,13 @@ retry: goto retry; } + /* + * apparently we are spending too long here with palloc locked. + * maybe acquiring the lock on np just before unlocking palloc + * can take a long time (say, due to a race with another cpu(s) + * for that Page)? + */ + /* No freelist cache when memory is relatively low */ if(palloc.freecount <= highwater) { unlock(&palloc); @@ -382,7 +552,7 @@ retry: break; /* No page of the correct color */ - if(np == nil) { + if(np == 0) { unlock(&palloc); uncachepage(p); return 1; @@ -426,7 +596,8 @@ copypage(Page *f, Page *t) panic("copypage"); ks = kmap(f); kd = kmap(t); - memmove((void*)VA(kd), (void*)VA(ks), pagesize(t)); + assert(t->lg2size < 32); /* else need to change memmove */ + memmove((void*)VA(kd), (void*)VA(ks), 1LL<lg2size); kunmap(ks); kunmap(kd); } @@ -436,12 +607,12 @@ uncachepage(Page *p) /* Always called with a locked page */ { Page **l, *f; - if(p->image == nil) + if(p->image == 0) return; lock(&palloc.hashlock); l = &pghash(p->daddr); - for(f = *l; f != nil; f = f->hash) { + for(f = *l; f; f = f->hash) { if(f == p) { *l = p->hash; break; @@ -450,7 +621,7 @@ uncachepage(Page *p) /* Always called with a locked page */ } unlock(&palloc.hashlock); putimage(p->image); - p->image = nil; + p->image = 0; p->daddr = 0; } @@ -483,13 +654,13 @@ cachedel(Image *i, ulong daddr) lock(&palloc.hashlock); l = &pghash(daddr); - for(f = *l; f != nil; f = f->hash) { + for(f = *l; f; f = f->hash) { if(f->image == i && f->daddr == daddr) { lock(f); if(f->image == i && f->daddr == daddr){ *l = f->hash; putimage(f->image); - f->image = nil; + f->image = 0; f->daddr = 0; } unlock(f); @@ -506,7 +677,7 @@ lookpage(Image *i, ulong daddr) Page *f; lock(&palloc.hashlock); - for(f = pghash(daddr); f != nil; f = f->hash) { + for(f = pghash(daddr); f; f = f->hash) { if(f->image == i && f->daddr == daddr) { unlock(&palloc.hashlock); @@ -527,7 +698,7 @@ lookpage(Image *i, ulong daddr) } unlock(&palloc.hashlock); - return nil; + return 0; } uvlong @@ -543,8 +714,20 @@ pagereclaim(int npages) * All the pages with images backing them are at the * end of the list (see putpage) so start there and work * backward. + * + * pageunchain and pagechainhead avoid resource starvation caused by + * duppage() shuffling the freelist differently. Without inserting + * cached pages at the freelist tail, the tail accumulates an uncached + * "stopper" page which breaks the invariant of imagereclaim, which + * scans from the tail backwards as long as pages are cached. + * + * imagereclaim does not move the pages to the head after uncaching + * them, so prevents the cached pages before the ones it reclaimed from + * being reclaimed ever. + * + * Vetted by Richard Miller. */ - for(p = palloc.tail; p != nil && p->image != nil && npages > 0; p = p->prev) { + for(p = palloc.tail; p && p->image && npages > 0; p = p->prev) { if(p->ref == 0 && canlock(p)) { if(p->ref == 0) { npages--; @@ -605,18 +788,18 @@ freepte(Segment *s, Pte *p) case SG_PHYSICAL: fn = s->pseg->pgfree; ptop = &p->pages[PTEPERTAB]; - if(fn) { + if(fn) { /* never used yet */ for(pg = p->pages; pg < ptop; pg++) { - if(*pg == nil) + if(*pg == 0) continue; (*fn)(*pg); - *pg = nil; + *pg = 0; } break; } for(pg = p->pages; pg < ptop; pg++) { pt = *pg; - if(pt == nil) + if(pt == 0) continue; lock(pt); ref = --pt->ref; @@ -627,9 +810,9 @@ freepte(Segment *s, Pte *p) break; default: for(pg = p->first; pg <= p->last; pg++) - if(*pg != nil) { + if(*pg) { putpage(*pg); - *pg = nil; + *pg = 0; } } free(p); --- /sys/src/9k/port/portdat.h +++ /sys/src/9k/port/portdat.h @@ -492,15 +492,23 @@ struct Palloc Pallocmem mem[32]; Page *head; /* most recently used */ Page *tail; /* least recently used */ - ulong freecount; /* how many pages on free list now */ + uintptr freecount; /* how many pages on free list now */ Page *pages; /* array of all pages */ - ulong user; /* how many user pages */ + uintptr user; /* how many user pages */ Page *hash[PGHSIZE]; Lock hashlock; Rendez r; /* Sleep for free mem */ QLock pwait; /* Queue of procs waiting for memory */ }; +extern uintptr pgmem; /* bytes allocated for palloc.pages */ +extern uchar pgcarved; /* palloc.pages carved from a memory bank? */ + +enum { /* allocpages() forcemalloc argument */ + Couldmalloc, + Mustmalloc, +}; + struct Waitq { Waitmsg w; @@ -999,3 +1007,4 @@ enum #pragma varargck type "m" Mreg #pragma varargck type "P" uintmem +#pragma varargck type "N" uvlong --- /sys/src/9k/port/portfns.h +++ /sys/src/9k/port/portfns.h @@ -15,6 +15,9 @@ int anyhigher(void); int anyready(void); Image* attachimage(int, Chan*, uintptr, uintptr); Page* auxpage(void); +void allocpages(uintmem, uintmem, int); +void pageunchain(Page*); +int unitsconv(Fmt*); Block* bl2mem(uchar*, Block*, int); int blocklen(Block*); void bootlinks(void);