forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup.c
More file actions
1730 lines (1514 loc) · 48.3 KB
/
setup.c
File metadata and controls
1730 lines (1514 loc) · 48.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright 2010 Tilera Corporation. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, version 2.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for
* more details.
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/node.h>
#include <linux/cpu.h>
#include <linux/ioport.h>
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/pci.h>
#include <linux/swiotlb.h>
#include <linux/initrd.h>
#include <linux/io.h>
#include <linux/highmem.h>
#include <linux/smp.h>
#include <linux/timex.h>
#include <linux/hugetlb.h>
#include <linux/start_kernel.h>
#include <linux/screen_info.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <hv/hypervisor.h>
#include <arch/interrupts.h>
/* <linux/smp.h> doesn't provide this definition. */
#ifndef CONFIG_SMP
#define setup_max_cpus 1
#endif
/* Absolute value of a signed int (negating INT_MIN would overflow). */
static inline int ABS(int x)
{
	if (x < 0)
		return -x;
	return x;
}
/* Chip information */
char chip_model[64] __write_once;
#ifdef CONFIG_VT
struct screen_info screen_info;
#endif
/* One pglist_data per possible NUMA node, exported for modules. */
struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
/*
 * Information on the NUMA nodes that we compute early.
 *
 * node_start_pfn[] / node_end_pfn[] hold the first page frame of each
 * node and the frame just past its last one; setup_memory() fills them
 * in from the ranges reported by hv_inquire_physical().
 */
unsigned long node_start_pfn[MAX_NUMNODES];
unsigned long node_end_pfn[MAX_NUMNODES];
/* Page frame where each node's mem_map lives; read back by alloc_remap(). */
unsigned long __initdata node_memmap_pfn[MAX_NUMNODES];
/* Page frame of the per-cpu data area set aside on each node. */
unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
/*
 * Starts at the node's first page frame and is advanced by
 * zone_sizes_init() past pages it reserves on non-bootmem nodes.
 */
unsigned long __initdata node_free_pfn[MAX_NUMNODES];
/* Bytes of per-cpu data needed on each node (percpu_size() per cpu). */
static unsigned long __initdata node_percpu[MAX_NUMNODES];
/*
* per-CPU stack and boot info.
*/
DEFINE_PER_CPU(unsigned long, boot_sp) =
(unsigned long)init_stack + THREAD_SIZE;
#ifdef CONFIG_SMP
DEFINE_PER_CPU(unsigned long, boot_pc) = (unsigned long)start_kernel;
#else
/*
* The variable must be __initdata since it references __init code.
* With CONFIG_SMP it is per-cpu data, which is exempt from validation.
*/
unsigned long __initdata boot_pc = (unsigned long)start_kernel;
#endif
#ifdef CONFIG_HIGHMEM
/* Page frame index of end of lowmem on each controller. */
unsigned long node_lowmem_end_pfn[MAX_NUMNODES];
/* Number of pages that can be mapped into lowmem. */
static unsigned long __initdata mappable_physpages;
#endif
/* Data on which physical memory controller corresponds to which NUMA node */
int node_controller[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = -1 };
#ifdef CONFIG_HIGHMEM
/* Map information from VAs to PAs */
unsigned long pbase_map[1 << (32 - HPAGE_SHIFT)]
__write_once __attribute__((aligned(L2_CACHE_BYTES)));
EXPORT_SYMBOL(pbase_map);
/* Map information from PAs to VAs */
void *vbase_map[NR_PA_HIGHBIT_VALUES]
__write_once __attribute__((aligned(L2_CACHE_BYTES)));
EXPORT_SYMBOL(vbase_map);
#endif
/* Node number as a function of the high PA bits */
int highbits_to_node[NR_PA_HIGHBIT_VALUES] __write_once;
EXPORT_SYMBOL(highbits_to_node);
/*
 * Cap, in pages, on the total RAM we are willing to use.  Set from the
 * "maxmem=", "mem=" or plain "memmap=nn" boot arguments; the default
 * of -1U acts as "no cap requested".
 */
static unsigned int __initdata maxmem_pfn = -1U;
/*
 * Per-node page caps from "maxnodemem=<node>:<size>"; every entry
 * defaults to -1U, i.e. no cap.
 */
static unsigned int __initdata maxnodemem_pfn[MAX_NUMNODES] = {
[0 ... MAX_NUMNODES-1] = -1U
};
/* Set of nodes named by the "isolnodes=" boot argument. */
static nodemask_t __initdata isolnodes;
#if defined(CONFIG_PCI) && !defined(__tilegx__)
/* Megabytes reserved for PCI when "pci_reserve=" is not given. */
enum { DEFAULT_PCI_RESERVE_MB = 64 };
/* Requested PCI reservation in megabytes; see setup_pci_reserve(). */
static unsigned int __initdata pci_reserve_mb = DEFAULT_PCI_RESERVE_MB;
/*
 * Page-frame bounds of the PCI reserved region.  Both start out as -1U;
 * the code that assigns real values is not in this part of the file.
 */
unsigned long __initdata pci_reserve_start_pfn = -1U;
unsigned long __initdata pci_reserve_end_pfn = -1U;
#endif
/*
 * "maxmem=" boot argument: cap the total RAM the kernel will use.
 *
 * The size is rounded down to a whole number of huge pages and stored
 * in maxmem_pfn as a count of base pages.  A missing argument or a size
 * that parses to zero is rejected with -EINVAL.
 */
static int __init setup_maxmem(char *str)
{
	unsigned long long bytes;

	if (!str)
		return -EINVAL;

	bytes = memparse(str, NULL);
	if (!bytes)
		return -EINVAL;

	/* Drop any partial huge page, then express the result in pages. */
	maxmem_pfn = (bytes >> HPAGE_SHIFT) << (HPAGE_SHIFT - PAGE_SHIFT);
	pr_info("Forcing RAM used to no more than %dMB\n",
		maxmem_pfn >> (20 - PAGE_SHIFT));
	return 0;
}
early_param("maxmem", setup_maxmem);
/*
 * "maxnodemem=<node>:<size>" boot argument: cap the RAM used on one node.
 *
 * The node number must be followed by ':' and must be a valid index
 * into maxnodemem_pfn[]; otherwise -EINVAL is returned.  The size is
 * rounded down to a whole number of huge pages and stored as a count
 * of base pages.
 */
static int __init setup_maxnodemem(char *str)
{
	char *endp;
	unsigned long long maxnodemem;
	long node;

	/*
	 * With no argument, pick a node number that is certain to fail the
	 * range check below, so that 'endp' is never examined.
	 */
	node = str ? simple_strtoul(str, &endp, 0) : INT_MAX;

	/*
	 * simple_strtoul() yields an unsigned long, so an oversized value
	 * can turn negative once stored in 'node'.  Reject that too, or we
	 * would write in front of maxnodemem_pfn[].
	 */
	if (node < 0 || node >= MAX_NUMNODES || *endp != ':')
		return -EINVAL;

	maxnodemem = memparse(endp+1, NULL);
	maxnodemem_pfn[node] = (maxnodemem >> HPAGE_SHIFT) <<
		(HPAGE_SHIFT - PAGE_SHIFT);
	pr_info("Forcing RAM used on node %ld to no more than %dMB\n",
		node, maxnodemem_pfn[node] >> (20 - PAGE_SHIFT));
	return 0;
}
early_param("maxnodemem", setup_maxnodemem);
/* One region named by a "memmap=nn$ss" boot argument. */
struct memmap_entry {
u64 addr; /* start of memory segment */
u64 size; /* size of memory segment */
};
/*
 * Regions collected by add_memmap_region(); setup_bootmem_allocator()
 * later reserves each of them.  memmap_nr counts the entries in use.
 */
static struct memmap_entry memmap_map[64];
static int memmap_nr;
/*
 * Append one (addr, size) region to memmap_map[].  If the table is
 * already full the region is dropped and an error is logged.
 */
static void add_memmap_region(u64 addr, u64 size)
{
	struct memmap_entry *slot;

	if (memmap_nr >= ARRAY_SIZE(memmap_map)) {
		pr_err("Ooops! Too many entries in the memory map!\n");
		return;
	}

	slot = &memmap_map[memmap_nr];
	slot->addr = addr;
	slot->size = size;
	memmap_nr++;
}
/*
 * "memmap=" boot argument.
 *
 *   memmap=exactmap  logged as invalid; returns 0
 *   memmap=nn@ss     logged as invalid
 *   memmap=nn#ss     logged as invalid
 *   memmap=nn$ss     records [ss, ss+nn) via add_memmap_region()
 *   memmap=nn        same effect as "maxmem=nn" (nn must be non-zero)
 *
 * Returns 0 when the whole string was consumed, -EINVAL otherwise.
 */
static int __init setup_memmap(char *p)
{
	char *begin;
	u64 start_at, mem_size;

	if (!p)
		return -EINVAL;

	if (strncmp(p, "exactmap", 8) == 0) {
		pr_err("\"memmap=exactmap\" not valid on tile\n");
		return 0;
	}

	/* A size must come first; fail if memparse() consumed nothing. */
	begin = p;
	mem_size = memparse(p, &p);
	if (p == begin)
		return -EINVAL;

	switch (*p) {
	case '@':
		pr_err("\"memmap=nn@ss\" (force RAM) invalid on tile\n");
		break;
	case '#':
		pr_err("\"memmap=nn#ss\" (force ACPI data) invalid on tile\n");
		break;
	case '$':
		start_at = memparse(p+1, &p);
		add_memmap_region(start_at, mem_size);
		break;
	default:
		if (mem_size == 0)
			return -EINVAL;
		maxmem_pfn = (mem_size >> HPAGE_SHIFT) <<
			(HPAGE_SHIFT - PAGE_SHIFT);
		break;
	}

	/* Anything left unparsed makes the argument invalid. */
	if (*p != '\0')
		return -EINVAL;
	return 0;
}
early_param("memmap", setup_memmap);
/* "mem=" is handled exactly like "maxmem=". */
static int __init setup_mem(char *str)
{
	int rc = setup_maxmem(str);

	return rc;
}
early_param("mem", setup_mem); /* compatibility with x86 */
/*
 * "isolnodes=" boot argument: parse a node list into isolnodes and log
 * the resulting set.  Returns -EINVAL if the argument is missing or
 * cannot be parsed.
 */
static int __init setup_isolnodes(char *str)
{
	char text[MAX_NUMNODES * 5];

	if (!str)
		return -EINVAL;
	if (nodelist_parse(str, isolnodes))
		return -EINVAL;

	nodelist_scnprintf(text, sizeof(text), isolnodes);
	pr_info("Set isolnodes value to '%s'\n", text);
	return 0;
}
early_param("isolnodes", setup_isolnodes);
#if defined(CONFIG_PCI) && !defined(__tilegx__)
/*
 * "pci_reserve=" boot argument: number of megabytes to reserve for PCIE
 * root complex mappings.  Values above 3 * 1024 MB, or anything that
 * does not parse as a number, are rejected with -EINVAL.
 */
static int __init setup_pci_reserve(char *str)
{
	unsigned long megabytes;

	if (!str)
		return -EINVAL;
	if (strict_strtoul(str, 0, &megabytes))
		return -EINVAL;
	if (megabytes > 3 * 1024)
		return -EINVAL;

	pci_reserve_mb = megabytes;
	pr_info("Reserving %dMB for PCIE root complex mappings\n",
		pci_reserve_mb);
	return 0;
}
early_param("pci_reserve", setup_pci_reserve);
#endif
#ifndef __tilegx__
/*
 * "vmalloc=size" boot argument: set the vmalloc area to 'size' bytes,
 * rounded up to a multiple of PGDIR_SIZE.  This can be used to grow or
 * shrink the vmalloc area.  A value that pushes _VMALLOC_START out of
 * the upper half of the address space is fatal.
 */
static int __init parse_vmalloc(char *arg)
{
	unsigned long long bytes;

	if (arg == NULL)
		return -EINVAL;

	bytes = memparse(arg, &arg);
	VMALLOC_RESERVE = (bytes + PGDIR_SIZE - 1) & PGDIR_MASK;

	/* See validate_va() for more on this test. */
	if ((long)_VMALLOC_START >= 0)
		early_panic("\"vmalloc=%#lx\" value too large: maximum %#lx\n",
			    VMALLOC_RESERVE, _VMALLOC_END - 0x80000000UL);

	return 0;
}
early_param("vmalloc", parse_vmalloc);
#endif
#ifdef CONFIG_HIGHMEM
/*
 * Determine for each controller where its lowmem is mapped and how much of
 * it is mapped there.  On controller zero, the first few megabytes are
 * already mapped in as code at MEM_SV_START, so in principle we could
 * start our data mappings higher up, but for now we don't bother, to avoid
 * additional confusion.
 *
 * One question is whether, on systems with more than 768 MB and
 * controllers of different sizes, to map in a proportionate amount of
 * each one, or to try to map the same amount from each controller.
 * (E.g. if we have three controllers with 256 MB, 1 GB, and 256 MB
 * respectively, do we map 256 MB from each, or do we map 128 MB, 512
 * MB, and 128 MB respectively?)  For now we use a proportionate
 * solution like the latter.
 *
 * The VA/PA mapping demands that we align our decisions at 16 MB
 * boundaries so that we can rapidly convert VA to PA.
 *
 * Side effects: fills pbase_map[] and vbase_map[], sets
 * mappable_physpages and node_lowmem_end_pfn[] for every online node.
 *
 * Returns the virtual address just past the last lowmem mapping made.
 */
static void *__init setup_pa_va_mapping(void)
{
/* Running total of pages on the lowmem-capable nodes visited so far. */
unsigned long curr_pages = 0;
/* Next VA to assign; carried across nodes so mappings are contiguous. */
unsigned long vaddr = PAGE_OFFSET;
nodemask_t highonlynodes = isolnodes;
int i, j;
/* Fill both tables with all-ones bytes, which marks entries as unset. */
memset(pbase_map, -1, sizeof(pbase_map));
memset(vbase_map, -1, sizeof(vbase_map));
/* Node zero cannot be isolated for LOWMEM purposes. */
node_clear(0, highonlynodes);
/* Count up the number of pages on non-highonlynodes controllers. */
mappable_physpages = 0;
for_each_online_node(i) {
if (!node_isset(i, highonlynodes))
mappable_physpages +=
node_end_pfn[i] - node_start_pfn[i];
}
for_each_online_node(i) {
unsigned long start = node_start_pfn[i];
unsigned long end = node_end_pfn[i];
unsigned long size = end - start;
unsigned long vaddr_end;
if (node_isset(i, highonlynodes)) {
/* Mark this controller as having no lowmem. */
node_lowmem_end_pfn[i] = start;
continue;
}
curr_pages += size;
/*
 * If everything fits below MAXMEM_PFN, map each node in full.
 * Otherwise scale the cumulative page count so that all nodes
 * together occupy MAXMEM_PFN pages, each in proportion to its size.
 */
if (mappable_physpages > MAXMEM_PFN) {
vaddr_end = PAGE_OFFSET +
(((u64)curr_pages * MAXMEM_PFN /
mappable_physpages)
<< PAGE_SHIFT);
} else {
vaddr_end = PAGE_OFFSET + (curr_pages << PAGE_SHIFT);
}
/*
 * Walk this node's share of the VA range one huge page at a time.
 * 'j' counts the huge pages taken from this node.
 */
for (j = 0; vaddr < vaddr_end; vaddr += HPAGE_SIZE, ++j) {
unsigned long this_pfn =
start + (j << HUGETLB_PAGE_ORDER);
pbase_map[vaddr >> HPAGE_SHIFT] = this_pfn;
/* Only the first VA seen for a given set of high PA bits is kept. */
if (vbase_map[__pfn_to_highbits(this_pfn)] ==
(void *)-1)
vbase_map[__pfn_to_highbits(this_pfn)] =
(void *)(vaddr & HPAGE_MASK);
}
/* Lowmem on this node ends after the 'j' huge pages mapped above. */
node_lowmem_end_pfn[i] = start + (j << HUGETLB_PAGE_ORDER);
BUG_ON(node_lowmem_end_pfn[i] > end);
}
/* Return highest address of any mapped memory. */
return (void *)vaddr;
}
#endif /* CONFIG_HIGHMEM */
/*
 * Register our most important memory mappings with the debug stub.
 *
 * One mapping is stored per online node, covering that node's lowmem
 * (or all of its memory when HIGHMEM is not configured), followed by
 * one mapping for the kernel text from _text up to _einittext.
 */
static void store_permanent_mappings(void)
{
	int nid;

	for_each_online_node(nid) {
		unsigned long first_pfn = node_start_pfn[nid];
#ifdef CONFIG_HIGHMEM
		HV_PhysAddr mapped_end_pfn = node_lowmem_end_pfn[nid];
#else
		HV_PhysAddr mapped_end_pfn = node_end_pfn[nid];
#endif
		HV_PhysAddr base_pa = ((HV_PhysAddr)first_pfn) << PAGE_SHIFT;
		unsigned long npages = mapped_end_pfn - first_pfn;
		HV_VirtAddr base_va = (HV_VirtAddr) __va(base_pa);

		hv_store_mapping(base_va, npages << PAGE_SHIFT, base_pa);
	}

	/* The text segment is registered with a physical address of zero. */
	hv_store_mapping((HV_VirtAddr)_text,
			 (uint32_t)(_einittext - _text), 0);
}
/*
 * Use hv_inquire_physical() to populate node_{start,end}_pfn[]
 * and node_online_map, doing suitable sanity-checking.
 * Also set min_low_pfn, max_low_pfn, and max_pfn.
 *
 * Each range the hypervisor reports becomes the node with the same
 * index.  A range that fails a sanity check is logged and skipped; a
 * range that exceeds a "maxnodemem"/"maxmem" limit is trimmed to fit.
 * On 32-bit chips the total is additionally capped at 8 million pages.
 */
static void __init setup_memory(void)
{
	int i, j;
	int highbits_seen[NR_PA_HIGHBIT_VALUES] = { 0 };
#ifdef CONFIG_HIGHMEM
	long highmem_pages;
#endif
#ifndef __tilegx__
	int cap;
#endif
#if defined(CONFIG_HIGHMEM) || defined(__tilegx__)
	long lowmem_pages;
#endif
	unsigned long physpages = 0;

	/* We are using a char to hold the cpu_2_node[] mapping */
	BUILD_BUG_ON(MAX_NUMNODES > 127);

	/* Discover the ranges of memory available to us */
	for (i = 0; ; ++i) {
		unsigned long start, size, end, highbits;
		HV_PhysAddrRange range = hv_inquire_physical(i);

		/* A zero-sized range marks the end of the list. */
		if (range.size == 0)
			break;
#ifdef CONFIG_FLATMEM
		if (i > 0) {
			pr_err("Can't use discontiguous PAs: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
#endif
#ifndef __tilegx__
		/* The low 32 bits of the start address must all be zero. */
		if ((unsigned long)range.start) {
			pr_err("Range not at 4GB multiple: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
#endif
		/* Shrink the range inward to huge-page boundaries. */
		if ((range.start & (HPAGE_SIZE-1)) != 0 ||
		    (range.size & (HPAGE_SIZE-1)) != 0) {
			unsigned long long start_pa = range.start;
			unsigned long long orig_size = range.size;
			range.start = (start_pa + HPAGE_SIZE - 1) & HPAGE_MASK;
			range.size -= (range.start - start_pa);
			range.size &= HPAGE_MASK;
			pr_err("Range not hugepage-aligned: %#llx..%#llx:"
			       " now %#llx-%#llx\n",
			       start_pa, start_pa + orig_size,
			       range.start, range.start + range.size);
		}
		highbits = __pa_to_highbits(range.start);
		if (highbits >= NR_PA_HIGHBIT_VALUES) {
			pr_err("PA high bits too high: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
		if (highbits_seen[highbits]) {
			pr_err("Range overlaps in high bits: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
		highbits_seen[highbits] = 1;

		/*
		 * Everything below indexes per-node arrays with 'i', so
		 * reject out-of-range node numbers before touching them.
		 */
		if (i >= MAX_NUMNODES) {
			pr_err("Too many PA nodes (#%d): %#llx...%#llx\n",
			       i, range.start, range.start + range.size);
			continue;
		}

		/* Apply the per-node "maxnodemem=" limit. */
		if (PFN_DOWN(range.size) > maxnodemem_pfn[i]) {
			int max_size = maxnodemem_pfn[i];
			if (max_size > 0) {
				pr_err("Maxnodemem reduced node %d to"
				       " %d pages\n", i, max_size);
				range.size = PFN_PHYS(max_size);
			} else {
				pr_err("Maxnodemem disabled node %d\n", i);
				continue;
			}
		}

		/* Apply the global "maxmem=" limit to the running total. */
		if (physpages + PFN_DOWN(range.size) > maxmem_pfn) {
			int max_size = maxmem_pfn - physpages;
			if (max_size > 0) {
				pr_err("Maxmem reduced node %d to %d pages\n",
				       i, max_size);
				range.size = PFN_PHYS(max_size);
			} else {
				pr_err("Maxmem disabled node %d\n", i);
				continue;
			}
		}

		start = range.start >> PAGE_SHIFT;
		size = range.size >> PAGE_SHIFT;
		end = start + size;

#ifndef __tilegx__
		/* Catch page frame numbers that overflowed 'unsigned long'. */
		if (((HV_PhysAddr)end << PAGE_SHIFT) !=
		    (range.start + range.size)) {
			pr_err("PAs too high to represent: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
#endif
#if defined(CONFIG_PCI) && !defined(__tilegx__)
		/*
		 * Blocks that overlap the pci reserved region must
		 * have enough space to hold the maximum percpu data
		 * region at the top of the range.  If there isn't
		 * enough space above the reserved region, just
		 * truncate the node.
		 *
		 * NOTE(review): PFN_UP() already yields a page count, so
		 * the extra ">> PAGE_SHIFT" below looks unintended and
		 * makes percpu_pages come out near zero -- confirm.
		 */
		if (start <= pci_reserve_start_pfn &&
		    end > pci_reserve_start_pfn) {
			unsigned int per_cpu_size =
				__per_cpu_end - __per_cpu_start;
			unsigned int percpu_pages =
				NR_CPUS * (PFN_UP(per_cpu_size) >> PAGE_SHIFT);
			if (end < pci_reserve_end_pfn + percpu_pages) {
				end = pci_reserve_start_pfn;
				pr_err("PCI mapping region reduced node %d to"
				       " %ld pages\n", i, end - start);
			}
		}
#endif

		/* Record which node owns each value of the high PA bits. */
		for (j = __pfn_to_highbits(start);
		     j <= __pfn_to_highbits(end - 1); j++)
			highbits_to_node[j] = i;

		node_start_pfn[i] = start;
		node_end_pfn[i] = end;
		node_controller[i] = range.controller;
		physpages += size;
		max_pfn = end;

		/* Mark node as online */
		node_set(i, node_online_map);
		node_set(i, node_possible_map);
	}

#ifndef __tilegx__
	/*
	 * For 4KB pages, mem_map "struct page" data is 1% of the size
	 * of the physical memory, so can be quite big (640 MB for
	 * four 16G zones).  These structures must be mapped in
	 * lowmem, and since we currently cap out at about 768 MB,
	 * it's impractical to try to use this much address space.
	 * For now, arbitrarily cap the amount of physical memory
	 * we're willing to use at 8 million pages (32GB of 4KB pages).
	 */
	cap = 8 * 1024 * 1024;  /* 8 million pages */
	if (physpages > cap) {
		int num_nodes = num_online_nodes();
		int cap_each = cap / num_nodes;
		unsigned long dropped_pages = 0;
		for (i = 0; i < num_nodes; ++i) {
			int size = node_end_pfn[i] - node_start_pfn[i];
			if (size > cap_each) {
				dropped_pages += (size - cap_each);
				node_end_pfn[i] = node_start_pfn[i] + cap_each;
			}
		}
		physpages -= dropped_pages;
		pr_warning("Only using %ldMB memory;"
		       " ignoring %ldMB.\n",
		       physpages >> (20 - PAGE_SHIFT),
		       dropped_pages >> (20 - PAGE_SHIFT));
		pr_warning("Consider using a larger page size.\n");
	}
#endif

	/* Heap starts just above the last loaded address. */
	min_low_pfn = PFN_UP((unsigned long)_end - PAGE_OFFSET);

#ifdef CONFIG_HIGHMEM
	/* Find where we map lowmem from each controller. */
	high_memory = setup_pa_va_mapping();

	/* Set max_low_pfn based on what node 0 can directly address. */
	max_low_pfn = node_lowmem_end_pfn[0];

	lowmem_pages = (mappable_physpages > MAXMEM_PFN) ?
		MAXMEM_PFN : mappable_physpages;
	highmem_pages = (long) (physpages - lowmem_pages);

	pr_notice("%ldMB HIGHMEM available.\n",
	       pages_to_mb(highmem_pages > 0 ? highmem_pages : 0));
	pr_notice("%ldMB LOWMEM available.\n",
			pages_to_mb(lowmem_pages));
#else
	/* Set max_low_pfn based on what node 0 can directly address. */
	max_low_pfn = node_end_pfn[0];

#ifndef __tilegx__
	if (node_end_pfn[0] > MAXMEM_PFN) {
		pr_warning("Only using %ldMB LOWMEM.\n",
		       MAXMEM>>20);
		pr_warning("Use a HIGHMEM enabled kernel.\n");
		max_low_pfn = MAXMEM_PFN;
		max_pfn = MAXMEM_PFN;
		node_end_pfn[0] = MAXMEM_PFN;
	} else {
		pr_notice("%ldMB memory available.\n",
		       pages_to_mb(node_end_pfn[0]));
	}
	/* Without HIGHMEM on a 32-bit chip only node 0 is used. */
	for (i = 1; i < MAX_NUMNODES; ++i) {
		node_start_pfn[i] = 0;
		node_end_pfn[i] = 0;
	}
	/*
	 * node_end_pfn[] holds page frame numbers, so convert with
	 * pfn_to_kaddr(); __va() expects a physical address.
	 */
	high_memory = pfn_to_kaddr(node_end_pfn[0]);
#else
	lowmem_pages = 0;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		int pages = node_end_pfn[i] - node_start_pfn[i];
		lowmem_pages += pages;
		/* high_memory ends up just past the last populated node. */
		if (pages)
			high_memory = pfn_to_kaddr(node_end_pfn[i]);
	}
	pr_notice("%ldMB memory available.\n",
	       pages_to_mb(lowmem_pages));
#endif
#endif
}
/*
 * Report whether a node takes part in bootmem allocation.
 *
 * On 32-bit machines, we only put bootmem on the low controller,
 * since PAs > 4GB can't be used in bootmem.  In principle one could
 * imagine, e.g., multiple 1 GB controllers all of which could support
 * bootmem, but in practice using controllers this small isn't a
 * particularly interesting scenario, so we just keep it simple and
 * use only the first controller for bootmem on 32-bit machines.
 * On 64-bit machines every node has bootmem.
 */
static inline int node_has_bootmem(int nid)
{
	int has_bootmem;

#ifdef CONFIG_64BIT
	has_bootmem = 1;
#else
	has_bootmem = !nid;
#endif
	return has_bootmem;
}
/*
 * Allocate 'size' bytes of page-aligned bootmem on node 'nid' and
 * return the page frame number of the allocation.  A non-zero 'goal'
 * is the physical address the allocation must land on; getting any
 * other address is treated as a bug.
 */
static inline unsigned long alloc_bootmem_pfn(int nid,
					      unsigned long size,
					      unsigned long goal)
{
	void *vaddr;
	unsigned long frame;

	vaddr = __alloc_bootmem_node(NODE_DATA(nid), size, PAGE_SIZE, goal);
	frame = kaddr_to_pfn(vaddr);
	BUG_ON(goal && PFN_PHYS(frame) != goal);

	return frame;
}
/*
 * Set up the bootmem allocator for node 'i'.
 *
 * Nodes without bootmem just borrow node 0's bdata.  For the others,
 * the lowmem range is handed to bootmem, with the allocator's bitmap
 * placed at the bottom of the heap on node 0 or allocated from node 0
 * for every other node.  Any overlap with the PCI reserved region is
 * then taken back out.
 */
static void __init setup_bootmem_allocator_node(int i)
{
	unsigned long first_pfn, last_pfn, bitmap_pages, bitmap_pfn;

	if (!node_has_bootmem(i)) {
		/* Share controller zero's bdata for now. */
		NODE_DATA(i)->bdata = &bootmem_node_data[0];
		return;
	}
	NODE_DATA(i)->bdata = &bootmem_node_data[i];

	/* Skip up to after the bss in node 0. */
	if (i == 0)
		first_pfn = min_low_pfn;
	else
		first_pfn = node_start_pfn[i];

	/* Only lowmem, if we're a HIGHMEM build. */
#ifdef CONFIG_HIGHMEM
	last_pfn = node_lowmem_end_pfn[i];
#else
	last_pfn = node_end_pfn[i];
#endif

	/* No memory here. */
	if (last_pfn == first_pfn)
		return;

	/* Figure out where the bootmem bitmap is located. */
	bitmap_pages = bootmem_bootmap_pages(last_pfn - first_pfn);
	if (i != 0) {
		/* Allocate bitmap on node 0 to avoid page table issues. */
		bitmap_pfn = alloc_bootmem_pfn(0, PFN_PHYS(bitmap_pages), 0);
	} else {
		/* Use some space right before the heap on node 0. */
		bitmap_pfn = first_pfn;
		first_pfn += bitmap_pages;
	}

	/* Initialize a node. */
	init_bootmem_node(NODE_DATA(i), bitmap_pfn, first_pfn, last_pfn);

	/* Free all the space back into the allocator. */
	free_bootmem(PFN_PHYS(first_pfn), PFN_PHYS(last_pfn - first_pfn));

#if defined(CONFIG_PCI) && !defined(__tilegx__)
	/*
	 * Throw away any memory aliased by the PCI region.
	 */
	if (pci_reserve_start_pfn < last_pfn &&
	    pci_reserve_end_pfn > first_pfn) {
		first_pfn = max(pci_reserve_start_pfn, first_pfn);
		last_pfn = min(pci_reserve_end_pfn, last_pfn);
		reserve_bootmem(PFN_PHYS(first_pfn),
				PFN_PHYS(last_pfn - first_pfn),
				BOOTMEM_EXCLUSIVE);
	}
#endif
}
/*
 * Bring up bootmem on every node, then reserve the regions that must
 * not be handed out: "memmap=" exclusions, the initrd image, and the
 * crash kernel area.
 */
static void __init setup_bootmem_allocator(void)
{
	int n;

	for (n = 0; n < MAX_NUMNODES; ++n)
		setup_bootmem_allocator_node(n);

	/* Reserve any memory excluded by "memmap" arguments. */
	for (n = 0; n < memmap_nr; ++n)
		reserve_bootmem(memmap_map[n].addr, memmap_map[n].size, 0);

#ifdef CONFIG_BLK_DEV_INITRD
	if (initrd_start) {
		/* Make sure the initrd memory region is not modified. */
		int busy = reserve_bootmem(initrd_start,
					   initrd_end - initrd_start,
					   BOOTMEM_EXCLUSIVE);

		if (!busy) {
			/*
			 * Translate initrd_start & initrd_end from PA to VA for
			 * future access.
			 */
			initrd_start += PAGE_OFFSET;
			initrd_end += PAGE_OFFSET;
		} else {
			pr_crit("The initrd memory region has been polluted. Disabling it.\n");
			initrd_start = 0;
			initrd_end = 0;
		}
	}
#endif

#ifdef CONFIG_KEXEC
	/* An empty crashk_res means no crash kernel region was set up. */
	if (crashk_res.start != crashk_res.end)
		reserve_bootmem(crashk_res.start, resource_size(&crashk_res), 0);
#endif
}
/*
 * Return the zeroed mem_map area for node 'nid'.  The area was placed
 * at node_memmap_pfn[nid] beforehand; 'size' must equal the node's
 * page count times sizeof(struct page).
 */
void *__init alloc_remap(int nid, unsigned long size)
{
	int npages = node_end_pfn[nid] - node_start_pfn[nid];
	void *memmap = pfn_to_kaddr(node_memmap_pfn[nid]);

	BUG_ON(size != npages * sizeof(struct page));

	return memset(memmap, 0, size);
}
/*
 * Bytes of per-cpu area needed by one cpu: the static per-cpu section
 * plus the module and early-dynamic reserves, raised to at least
 * PCPU_MIN_UNIT_SIZE and rounded up to a whole page.
 */
static int __init percpu_size(void)
{
	int bytes = __per_cpu_end - __per_cpu_start;

	bytes += PERCPU_MODULE_RESERVE;
	bytes += PERCPU_DYNAMIC_EARLY_SIZE;
	if (bytes < PCPU_MIN_UNIT_SIZE)
		bytes = PCPU_MIN_UNIT_SIZE;
	bytes = roundup(bytes, PAGE_SIZE);

	/* In several places we assume the per-cpu data fits on a huge page. */
	BUG_ON(kdata_huge && bytes > HPAGE_SIZE);

	return bytes;
}
/*
 * Size the memory zones of every online node and hand them to
 * free_area_init_node().
 *
 * Before doing so, work out how much per-cpu data each node must hold
 * and set aside pages on each node for that data and for the node's
 * mem_map, recording the locations in node_percpu_pfn[] and
 * node_memmap_pfn[].
 */
static void __init zone_sizes_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES] = { 0 };
	int size = percpu_size();
	int num_cpus = smp_height * smp_width;
	/* Page frames below this one lie in the first 4GB of PA space. */
	const unsigned long dma_end = (1UL << (32 - PAGE_SHIFT));

	int i;

	/* Charge each cpu's per-cpu area to the node that cpu belongs to. */
	for (i = 0; i < num_cpus; ++i)
		node_percpu[cpu_to_node(i)] += size;

	for_each_online_node(i) {
		unsigned long start = node_start_pfn[i];
		unsigned long end = node_end_pfn[i];
#ifdef CONFIG_HIGHMEM
		unsigned long lowmem_end = node_lowmem_end_pfn[i];
#else
		unsigned long lowmem_end = end;
#endif
		int memmap_size = (end - start) * sizeof(struct page);
		node_free_pfn[i] = start;

		/*
		 * Set aside pages for per-cpu data and the mem_map array.
		 *
		 * Since the per-cpu data requires special homecaching,
		 * if we are in kdata_huge mode, we put it at the end of
		 * the lowmem region.  If we're not in kdata_huge mode,
		 * we take the per-cpu pages from the bottom of the
		 * controller, since that avoids fragmenting a huge page
		 * that users might want.  We always take the memmap
		 * from the bottom of the controller, since with
		 * kdata_huge that lets it be under a huge TLB entry.
		 *
		 * If the user has requested isolnodes for a controller,
		 * though, there'll be no lowmem, so we just alloc_bootmem
		 * the memmap.  There will be no percpu memory either.
		 */
		if (i != 0 && node_isset(i, isolnodes)) {
			/* isolnodes is a nodemask, so test it as one. */
			node_memmap_pfn[i] =
				alloc_bootmem_pfn(0, memmap_size, 0);
			BUG_ON(node_percpu[i] != 0);
		} else if (node_has_bootmem(i)) {
			/* node_has_bootmem() takes a node id, not a pfn. */
			unsigned long goal = 0;
			node_memmap_pfn[i] =
				alloc_bootmem_pfn(i, memmap_size, 0);
			if (kdata_huge)
				goal = PFN_PHYS(lowmem_end) - node_percpu[i];
			if (node_percpu[i])
				node_percpu_pfn[i] =
					alloc_bootmem_pfn(i, node_percpu[i],
							  goal);
		} else {
			/* In non-bootmem zones, just reserve some pages. */
			node_memmap_pfn[i] = node_free_pfn[i];
			node_free_pfn[i] += PFN_UP(memmap_size);
			if (!kdata_huge) {
				node_percpu_pfn[i] = node_free_pfn[i];
				node_free_pfn[i] += PFN_UP(node_percpu[i]);
			} else {
				node_percpu_pfn[i] =
					lowmem_end - PFN_UP(node_percpu[i]);
			}
		}

#ifdef CONFIG_HIGHMEM
		if (start > lowmem_end) {
			zones_size[ZONE_NORMAL] = 0;
			zones_size[ZONE_HIGHMEM] = end - start;
		} else {
			zones_size[ZONE_NORMAL] = lowmem_end - start;
			zones_size[ZONE_HIGHMEM] = end - lowmem_end;
		}
#else
		zones_size[ZONE_NORMAL] = end - start;
#endif

		/* Carve the part of ZONE_NORMAL below dma_end into ZONE_DMA. */
		if (start < dma_end) {
			zones_size[ZONE_DMA] = min(zones_size[ZONE_NORMAL],
						   dma_end - start);
			zones_size[ZONE_NORMAL] -= zones_size[ZONE_DMA];
		} else {
			zones_size[ZONE_DMA] = 0;
		}

		/* Take zone metadata from controller 0 if we're isolnode. */
		if (node_isset(i, isolnodes))
			NODE_DATA(i)->bdata = &bootmem_node_data[0];

		free_area_init_node(i, zones_size, start, NULL);
		printk(KERN_DEBUG "  Normal zone: %ld per-cpu pages\n",
		       PFN_UP(node_percpu[i]));

		/* Track the type of memory on each node */
		if (zones_size[ZONE_NORMAL] || zones_size[ZONE_DMA])
			node_set_state(i, N_NORMAL_MEMORY);
#ifdef CONFIG_HIGHMEM
		if (end != start)
			node_set_state(i, N_HIGH_MEMORY);
#endif

		node_set_online(i);
	}
}
#ifdef CONFIG_NUMA
/* which logical CPUs are on which nodes */
struct cpumask node_2_cpu_mask[MAX_NUMNODES] __write_once;
EXPORT_SYMBOL(node_2_cpu_mask);
/* which node each logical CPU is on */
char cpu_2_node[NR_CPUS] __write_once __attribute__((aligned(L2_CACHE_BYTES)));
EXPORT_SYMBOL(cpu_2_node);
/*
 * Return the node a cpu has been assigned to, or -1 if the cpu is not
 * possible or is still in the set of cpus awaiting assignment.
 */
static int __init cpu_to_bound_node(int cpu, struct cpumask *unbound_cpus)
{
	if (cpu_possible(cpu) && !cpumask_test_cpu(cpu, unbound_cpus))
		return cpu_to_node(cpu);

	return -1;
}
/*
 * Count how many of the (up to four) tiles directly left, right, above
 * and below 'cpu' in the smp_width x smp_height grid are already bound
 * to NUMA node 'node'.
 */
static int __init node_neighbors(int node, int cpu,
				 struct cpumask *unbound_cpus)
{
	const int width = smp_width;
	const int height = smp_height;
	const int col = cpu % width;
	const int row = cpu / width;
	int count = 0;

	/* Left and right neighbors, unless we sit on that edge. */
	if (col > 0 && cpu_to_bound_node(cpu-1, unbound_cpus) == node)
		count++;
	if (col < width-1 && cpu_to_bound_node(cpu+1, unbound_cpus) == node)
		count++;

	/* Neighbors one row up and one row down. */
	if (row > 0 && cpu_to_bound_node(cpu-width, unbound_cpus) == node)
		count++;
	if (row < height-1 && cpu_to_bound_node(cpu+width, unbound_cpus) == node)
		count++;

	return count;
}
/*
 * Assign every possible cpu to a NUMA node.
 *
 * The candidate ("default") nodes are the online nodes not listed in
 * isolnodes, falling back to node 0 if that leaves none.  Cpus are
 * handed out to the candidate nodes in round-robin order; on each turn
 * the node takes the still-unassigned cpu with the lowest metric (see
 * the comment inside the loop).  Results go to cpu_2_node[] and
 * node_2_cpu_mask[].  Cpus that are not possible end up on the first
 * default node.
 */
static void __init setup_numa_mapping(void)
{
/* distance[node][cpu]; entries never filled in stay at -1. */
int distance[MAX_NUMNODES][NR_CPUS];
HV_Coord coord;
int cpu, node, cpus, i, x, y;
int num_nodes = num_online_nodes();
/* Possible cpus that have not yet been given a node. */
struct cpumask unbound_cpus;
nodemask_t default_nodes;
cpumask_clear(&unbound_cpus);
/* Get set of nodes we will use for defaults */
nodes_andnot(default_nodes, node_online_map, isolnodes);
if (nodes_empty(default_nodes)) {
BUG_ON(!node_isset(0, node_online_map));
pr_err("Forcing NUMA node zero available as a default node\n");
node_set(0, default_nodes);
}
/* Populate the distance[] array */
memset(distance, -1, sizeof(distance));
/* Cpus are numbered row by row across the smp_width x smp_height grid. */
cpu = 0;
for (coord.y = 0; coord.y < smp_height; ++coord.y) {
for (coord.x = 0; coord.x < smp_width;
++coord.x, ++cpu) {
BUG_ON(cpu >= nr_cpu_ids);
if (!cpu_possible(cpu)) {
cpu_2_node[cpu] = -1;
continue;
}
for_each_node_mask(node, default_nodes) {
/*
 * NOTE(review): the sum of |x| + |y| below presumably treats
 * info.coord as the controller's offset from this tile --
 * confirm against the hypervisor interface.
 */
HV_MemoryControllerInfo info =
hv_inquire_memory_controller(
coord, node_controller[node]);
distance[node][cpu] =
ABS(info.coord.x) + ABS(info.coord.y);
}
cpumask_set_cpu(cpu, &unbound_cpus);
}
}
/* Number of grid positions visited above. */
cpus = cpu;
/*
 * Round-robin through the NUMA nodes until all the cpus are
 * assigned.  We could be more clever here (e.g. create four
 * sorted linked lists on the same set of cpu nodes, and pull
 * off them in round-robin sequence, removing from all four
 * lists each time) but given the relatively small numbers
 * involved, O(n^2) seems OK for a one-time cost.
 */
node = first_node(default_nodes);
while (!cpumask_empty(&unbound_cpus)) {
int best_cpu = -1;
int best_distance = INT_MAX;
for (cpu = 0; cpu < cpus; ++cpu) {
if (cpumask_test_cpu(cpu, &unbound_cpus)) {
/*
 * Compute metric, which is how much
 * closer the cpu is to this memory
 * controller than the others, shifted
 * up, and then the number of
 * neighbors already in the node as an
 * epsilon adjustment to try to keep
 * the nodes compact.
 *
 * Note the scale factor is the number of online
 * nodes, while the subtraction runs over the
 * default nodes only.
 */
int d = distance[node][cpu] * num_nodes;
for_each_node_mask(i, default_nodes) {
if (i != node)
d -= distance[i][cpu];
}
d *= 8; /* allow space for epsilon */
d -= node_neighbors(node, cpu, &unbound_cpus);
/* Strict '<' keeps the lowest-numbered cpu on ties. */
if (d < best_distance) {
best_cpu = cpu;
best_distance = d;
}
}
}
BUG_ON(best_cpu < 0);
/* Bind the winner to this node and move on to the next node. */
cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
cpu_2_node[best_cpu] = node;
cpumask_clear_cpu(best_cpu, &unbound_cpus);
node = next_node(node, default_nodes);
/* Wrap around once we run off the end of the default nodes. */
if (node == MAX_NUMNODES)
node = first_node(default_nodes);
}
/* Print out node assignments and set defaults for disabled cpus */
cpu = 0;
for (y = 0; y < smp_height; ++y) {
printk(KERN_DEBUG "NUMA cpu-to-node row %d:", y);
for (x = 0; x < smp_width; ++x, ++cpu) {
if (cpu_to_node(cpu) < 0) {
/* Printed as "-", then given the first default node. */
pr_cont(" -");
cpu_2_node[cpu] = first_node(default_nodes);
} else {
pr_cont(" %d", cpu_to_node(cpu));
}
}
pr_cont("\n");
}
}
static struct cpu cpu_devices[NR_CPUS];
static int __init topology_init(void)
{
int i;
for_each_online_node(i)
register_one_node(i);