l4re-base-25.08.0

This commit is contained in:
2025-09-12 15:55:45 +02:00
commit d959eaab98
37938 changed files with 9382688 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
# Defaults: PKGDIR is the package root (parent directory); L4DIR points at
# the L4Re build tree two levels above the package.
PKGDIR ?= ..
L4DIR ?= $(PKGDIR)/../..
# Descend into all subdirectories via the generic L4Re subdir make rules.
include $(L4DIR)/mk/subdir.mk

View File

@@ -0,0 +1,333 @@
#pragma once
extern "C" {
#include <libfdt.h>
}
#include <cassert>
#include <initializer_list>
namespace Dtb {
/**
 * Cell provides data structures and methods to handle cell based properties
 *
 * Device trees contain properties described by cells. The properties are
 * - stored in big endian
 * - the number of cells is specified by other properties like
 *   \#address-cells, \#size-cells, \#interrupt-cells
 * - a property has at most 4 cells
 *
 * Cells might be translated from one domain into another. The
 * translation is done by comparing regions, calculating the offset
 * relative to a region in the current domain and applying this offset
 * to a region in another domain. Therefore cells need relational
 * operation, addition and subtraction.
 */
class Cell
{
public:
enum
{
Max_size = 4 /**< Maximal number of allowed cells */
};
/**
 * Create a Cell from an explicit list of up to Max_size values.
 *
 * The values are right-aligned: the last list element becomes the
 * lowest-order cell; leading cells are zero-filled.
 */
static Cell make_cell(std::initializer_list<uint32_t> l)
{
return Cell(l);
}
/**
 * Construct a default invalid cell
 *
 * An invalid cell is a tuple of {~0U, ~0U, ~0U, ~0U}.
 */
Cell()
{
for (auto &v: _values)
v = ~0U;
}
/**
 * Construct a Cell object from a device tree property
 *
 * The property values are converted from big endian to host byte order
 * and stored right-aligned; leading cells are zero-filled.
 *
 * \param values Pointer to the property values
 * \param size   Number of cells in the property; Must not exceed
 *               Max_size.
 */
Cell(fdt32_t const *values, size_t size)
{
assert(size <= Max_size);
for (auto &v: _values)
v = 0;
for (unsigned i = 0, offs = Max_size - size; i < size; ++i)
_values[offs + i] = fdt32_to_cpu(values[i]);
}
/**
 * Read-only access to a single cell value.
 *
 * Index 0 is the highest-order cell.
 */
uint32_t const &operator [] (size_t idx) const
{
assert(idx < Max_size);
return _values[idx];
}
/**
 * Check whether a Cell object is valid
 *
 * The default constructor sets the cell to {~0U, ~0U, ~0U, ~0U}. If
 * the cell object contains anything else it is considered to be
 * valid.
 *
 * \return bool true if the cell is different from {~0U, ~0U, ~0U, ~0U}
 */
bool is_valid() const
{
for (auto x: _values)
if (x != ~0U)
return true;
return false;
}
/**
 * Add two Cell objects
 *
 * We assume that cells are stored as 32 bit values in big endian
 * order and can be added by simply adding the individual 32 bit
 * values and any overflow from a previous addition.
 *
 * We do not check whether there is an overflow when adding the
 * highest 32 bit values.
 */
Cell operator + (Cell const &other) const
{
Cell result;
uint32_t carry = 0;
for (int i = Max_size - 1; i >= 0; --i)
{
uint64_t a = _values[i];
uint64_t b = other._values[i];
uint64_t res = a + b + carry;
carry = (res >> 32) ? 1 : 0;
result._values[i] = static_cast<uint32_t>(res);
}
// XXX no overflow check yet
return result;
}
/**
 * Subtract a Cell object from another
 *
 * We assume that cells are stored as 32 bit values in big endian
 * order and the difference can be calculated by simply subtracting
 * the individual 32 bit values and any borrow from a previous
 * subtraction.
 *
 * We do not check whether a is larger than b in (a - b), which
 * would lead to an overflow.
 */
Cell operator - (Cell const &other) const
{
Cell result;
uint32_t carry = 0;
for (int i = Max_size - 1; i >= 0; --i)
{
uint64_t a = _values[i];
uint64_t b = other._values[i];
uint64_t res = a - b - carry;
// On underflow the upper 32 bits of res are set -> borrow 1.
carry = (res >> 32) ? 1 : 0;
result._values[i] = static_cast<uint32_t>(res);
}
// XXX no overflow check yet
return result;
}
/**
 * Element-wise bitwise AND of two cells.
 */
Cell operator & (Cell const &other) const
{
Cell result;
for (int i = 0; i < Max_size; i++)
result._values[i] = _values[i] & other._values[i];
return result;
}
/**
 * In-place element-wise bitwise AND.
 */
Cell& operator &= (Cell const &other)
{
for (int i = 0; i < Max_size; i++)
_values[i] &= other._values[i];
return *this;
}
/**
 * Relational operator Cell A < Cell B
 */
bool operator < (Cell const &other) const
{ return cmp(other) == -1; }
/**
 * Relational operator Cell A <= Cell B
 */
bool operator <= (Cell const &other) const
{ return cmp(other) != 1; }
/**
 * Relational operator Cell A == Cell B
 */
bool operator == (Cell const &other) const
{ return cmp(other) == 0; }
/**
 * Relational operator Cell A != Cell B
 */
bool operator != (Cell const &other) const
{ return cmp(other) != 0; }
/**
 * Relational operator Cell A >= Cell B
 */
bool operator >= (Cell const &other) const
{ return cmp(other) != -1; }
/**
 * Relational operator Cell A > Cell B
 */
bool operator > (Cell const &other) const
{ return cmp(other) == 1; }
/**
 * Check whether the cell object contains a valid memory address
 *
 * We consider any 32bit or 64bit value a valid memory address. If
 * the cell contains anything other than 0 in the highest order
 * values, it must be something else and cannot be interpreted as a
 * memory address.
 *
 * \return bool true, if the cell contains a 32bit or 64bit value.
 */
bool is_uint64() const
{ return !_values[0] && !_values[1]; }
/**
 * Get the memory address of this cell
 *
 * Returns the value of the cell as 64bit value. It asserts, that
 * the cell actually contains something, that can be interpreted as
 * memory address.
 *
 * \return uint64_t the cell contents as 64bit value
 */
uint64_t get_uint64() const
{
assert(is_uint64());
return (static_cast<uint64_t>(_values[2]) << 32) + _values[3];
}
private:
/**
 * Construct a cell from an initializer list (see make_cell()).
 *
 * Values are stored right-aligned; leading cells are zero-filled.
 */
Cell(std::initializer_list<uint32_t> l)
{
assert(l.size() <= Max_size);
for (auto &v: _values)
v = 0;
unsigned i = Max_size - l.size();
for (uint32_t v : l)
_values[i++] = v;
}
/**
 * Compare two cell objects
 *
 * We assume that cells are stored as 32 bit values in big endian
 * order and that we can compare them starting at the highest order
 * value.
 *
 * \param other Cell object to compare with
 * \retval -1 cell is smaller than other cell
 * \retval 0 cells are equal
 * \retval 1 cell is larger than other cell
 */
int cmp(Cell const &other) const
{
unsigned i;
for (i = 0; i < Max_size; ++i)
{
if (_values[i] < other._values[i])
return -1;
if (_values[i] > other._values[i])
return 1;
}
return 0;
}
uint32_t _values[Max_size]; /**< Cell values; index 0 is the highest-order cell */
};
/**
* Data and methods associated with a range property in a device tree
*
* Ranges in a device tree describe to translation of regions from one
* domain to another.
*/
class Range
{
public:
/**
* Translate an address from one domain to another
*
* This function takes an address cell and a size cell and
* translates the address from one domain to another if there is a
* matching range.
*
* \param[inout] address Address cell that shall be translated
* \param[in] size Size Size cell associated with the address
*/
bool translate(Cell *address, Cell const &size)
{
assert(address);
if (match(*address, size))
{
*address = (*address - _child) + _parent;
return true;
}
return false;
}
Range(Cell const &child, Cell const &parent, Cell const &length)
: _child{child}, _parent{parent}, _length{length} {};
private:
// ranges: child, parent, length
// child.cells == this->cells
// parent.cells == parent.cells
Cell _child;
Cell _parent;
Cell _length;
// [address, address + size] subset of [child, child + length] ?
bool match(Cell const &address, Cell const &size) const
{
Cell address_max = address + size;
Cell child_max = _child + _length;
return (_child <= address) && (address_max <= child_max);
}
};
/**
 * Data and methods associated with a reg property in a device tree
 */
struct Reg
{
  Cell address;  ///< Base address cell
  Cell size;     ///< Size cell

  Reg(Cell const &address, Cell const &size)
  : address{address}, size{size}
  {}

  bool operator == (Reg const &other) const
  {
    if (address != other.address)
      return false;

    return size == other.size;
  }

  bool operator != (Reg const &other) const
  { return !(*this == other); }
};
} // namespace Dtb

File diff suppressed because it is too large — use “Load Diff” to view it.

View File

@@ -0,0 +1,851 @@
/*
* Copyright (C) 2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Benjamin Lamowski <benjamin.lamowski@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
/**
* \file
* Basic ACPI tables.
*
* Adapted from the ACPI Specification version 6.3.
* Currently only implements the ACPI tables necessary to make Linux find local
* APICs for SMP.
*/
#pragma once
#include <l4/cxx/utils>
#include <consts.h>
#include <array>
#include <vector>
#include "debug.h"
#include "guest.h"
#include "cpu_dev_array.h"
#include "cpu_dev.h"
#include "ioapic.h"
#include "virt_lapic.h"
#include "mem_types.h"
extern "C" {
#include "platform/acenv.h"
#include "actypes.h"
#include "actbl.h"
}
namespace Acpi
{
// Debug channels used by the ACPI table code, one per verbosity level.
// NOTE: declared `static` in a header (see #pragma once above), so every
// translation unit including this file gets its own instances.
static Dbg info(Dbg::Dev, Dbg::Info, "ACPI");
static Dbg warn(Dbg::Dev, Dbg::Warn, "ACPI");
static Dbg trace(Dbg::Dev, Dbg::Trace, "ACPI");
class Tables;
class Acpi_device;
/**
 * Registry of devices that need to insert information into ACPI tables.
 *
 * Upon Uvmm startup devices will be created from the device tree. These can
 * register themselves here. The Acpi::Tables class will then call the
 * Acpi_device functions of these devices to fill the ACPI tables. It will
 * also delete the Acpi_device_hub after use.
 */
class Acpi_device_hub
{
  friend class Tables;

public:
  /// Record a device in the registry (creates the hub on first use).
  static void register_device(Acpi_device const *dev)
  {
    get()->_devices.push_back(dev);
  }

private:
  Acpi_device_hub() = default;
  ~Acpi_device_hub() = default;

  /// Lazily create and return the singleton instance.
  static Acpi_device_hub *get()
  {
    if (_hub == nullptr)
      _hub = new Acpi_device_hub();

    return _hub;
  }

  /// All registered devices, in registration order.
  std::vector<Acpi_device const*> const &devices() const
  { return _devices; }

  /// Tear down the singleton; deleting a nullptr hub is a no-op.
  static void destroy()
  {
    delete _hub;
    _hub = nullptr;
  }

  static Acpi_device_hub *_hub;
  std::vector<Acpi_device const*> _devices;
};
/**
 * Devices that must register with ACPI shall implement this interface.
 *
 * Constructing an Acpi_device registers it with the Acpi_device_hub; the
 * amend_* hooks are later invoked while the ACPI tables are generated.
 * The default implementations amend nothing.
 */
class Acpi_device
{
public:
explicit Acpi_device()
{
Acpi_device_hub::register_device(this);
}
/// Hook to modify fields of the FADT before its checksum is resolved.
virtual void amend_fadt(ACPI_TABLE_FADT *) const {};
/// Hook to append an MCFG allocation entry at the given location; returns
/// the number of bytes written (0 if nothing was added).
virtual l4_size_t amend_mcfg(ACPI_MCFG_ALLOCATION *, l4_size_t) const { return 0; };
/**
 * Amend the DSDT ACPI table (highest priority).
 *
 * This method is executed before all the #amend_dsdt_late methods of all
 * ACPI devices.
 *
 * \return Number of bytes written to the supplied buffer (0 if none).
 */
virtual l4_size_t amend_dsdt(void *, l4_size_t) const { return 0; };
/**
 * Amend the DSDT ACPI table (lowest priority).
 *
 * This method is executed after all the #amend_dsdt methods of all ACPI
 * devices. This is especially useful if the amendment refers to a scope
 * that needs to be already defined before.
 *
 * \return Number of bytes written to the supplied buffer (0 if none).
 */
virtual l4_size_t amend_dsdt_late(void *, l4_size_t) const { return 0; };
};
/**
 * Singleton for access to the FACS table.
 *
 * Used by ACPI platform to acquire the wakeup vector and zeropage to reserve
 * the FACS location in guest memory in the e820 map.
 */
class Facs_storage
{
public:
  /// Lazily create and return the singleton instance.
  static Facs_storage *get()
  {
    if (!_facs_storage)
      _facs_storage = new Facs_storage();
    return _facs_storage;
  }

  /// Record the (host-virtual) location of the FACS table.
  void set_addr(ACPI_TABLE_FACS *table) { _facs = table; }

  /// Record the guest-physical address of the FACS table.
  void set_gaddr(l4_addr_t gaddr) { _gfacs = Vmm::Guest_addr(gaddr); }

  /// Firmware waking vector; set_addr() must have been called before.
  l4_uint32_t waking_vector() const
  {
    assert(_facs); // fail fast instead of dereferencing an unset pointer
    return _facs->FirmwareWakingVector;
  }

  /// Guest memory region occupied by the FACS table.
  Vmm::Region mem_region() const
  {
    assert(_gfacs.get() != 0);
    return Vmm::Region::ss(_gfacs, sizeof(ACPI_TABLE_FACS),
                           Vmm::Region_type::Ram);
  }

private:
  Facs_storage() = default;
  ~Facs_storage() = default;

  static Facs_storage *_facs_storage;
  // Initialized to nullptr so a use-before-set is deterministic instead of
  // reading an indeterminate pointer (the original left this uninitialized).
  ACPI_TABLE_FACS *_facs = nullptr;
  Vmm::Guest_addr _gfacs;
};
// Interrupt override data for MADT table
// see ACPI Spec v6.3, 5.2.12.5 Interrupt Source Override Structure
struct Madt_int_override
{
l4_uint8_t src_irq; // Source IRQ; written to ACPI_MADT_INTERRUPT_OVERRIDE::SourceIrq
l4_uint32_t gsi; // Global system interrupt; written to ::GlobalIrq
l4_uint16_t flags; // Written to ::IntiFlags
};
// Static storage management for interrupt override entries
class Madt_int_override_storage
{
public:
  /// Lazily create and return the singleton instance.
  static Madt_int_override_storage *get()
  {
    if (_self == nullptr)
      _self = new Madt_int_override_storage();

    return _self;
  }

  /// Append one interrupt source override entry.
  void add_override(Madt_int_override new_override)
  {
    _overrides.push_back(new_override);
  }

  /// All recorded override entries, in insertion order.
  std::vector<Madt_int_override> const &overrides() const
  {
    return _overrides;
  }

private:
  Madt_int_override_storage() = default;
  ~Madt_int_override_storage() = default;

  static Madt_int_override_storage *_self;
  std::vector<Madt_int_override> _overrides;
};
/**
 * ACPI control.
 *
 * Manage the creation of ACPI tables in guest memory.
 */
class Tables
{
public:
  ~Tables()
  {
    Acpi_device_hub::destroy();
  }

protected:
  /// Byte sizes of the fixed-size ACPI structures we emit.
  enum Table_sizes : l4_size_t
  {
    Header_size = sizeof(ACPI_TABLE_HEADER),
    Rsdp_size = sizeof(ACPI_TABLE_RSDP),
    Rsdp_v1_size = sizeof(ACPI_RSDP_COMMON),
    Facs_size = sizeof(ACPI_TABLE_FACS)
  };

  /// Identifiers of all tables whose start offsets the Writer tracks.
  enum class Table : unsigned
  {
    Rsdt,
    Xsdt,
    Fadt,
    Madt,
    Mcfg,
    Facs,
    Dsdt,
    Num_values,
  };

  /**
   * Helps with generating ACPI structures by providing abstractions for common
   * operations, table references and checksums.
   *
   * Table reference fields and checksum fields are not filled in immediately,
   * but instead a list of fixups is kept for them. Firstly, this simplifies the
   * creation of ACPI structures, since the size and layout of the tables no
   * longer have to be calculated in advance, which is particularly tricky for
   * dynamically-sized tables. Secondly, this allows a more flexible use of the
   * generated ACPI structures, since they can now be relocated to arbitrary
   * memory addresses thanks to the fixups.
   */
  class Writer
  {
  public:
    Writer(l4_addr_t buf_addr, unsigned buf_size)
    : _buf_addr(buf_addr), _buf_size(buf_size), _pos(0)
    {}

    /**
     * Return current write position.
     */
    unsigned pos() const
    { return _pos; }

    /**
     * Return number of unused bytes remaining in the write buffer.
     */
    unsigned remaining_size() const
    { return _buf_size - _pos; }

    /**
     * Register the given ACPI table to start at the current write position, if
     * necessary adjusted to the tables alignment requirements. Then reserve
     * memory for the ACPI table.
     *
     * \tparam T     Type of the table.
     * \param table  Table
     * \param len    Length of memory to reserve for the table.
     * \param align  Alignment required by the table.
     */
    template<typename T>
    T *start_table(Table table, unsigned len = sizeof(T), unsigned align = 8)
    {
      if (_pos % align != 0)
        reserve<void>(align - (_pos % align));

      _tables[static_cast<unsigned>(table)] = _pos;
      return reserve<T>(len);
    }

    /**
     * Reserve memory.
     *
     * \tparam T   Type to reserve memory for.
     * \param len  Length of the memory to reserve, defaults to size of T.
     *
     * \throws L4::Runtime_error(-L4_ENOMEM) if the buffer is exhausted.
     */
    template<typename T = void>
    T *reserve(unsigned len = sizeof(T))
    {
      if (_pos + len > _buf_size)
        {
          Err().printf("ACPI table memory allocation exhausted. "
                       "Please configure less ACPI devices "
                       "or raise the ACPI table size limit.\n");
          L4Re::throw_error(-L4_ENOMEM, "ACPI table memory allocation exhausted.");
        }

      T *base = as_ptr<T>(_pos);
      _pos += len;
      return base;
    }

    /**
     * Write an identifier with correct padding.
     *
     * \param dest   Pointer to the memory destination.
     * \param value  String to write.
     * \param len    Length of the identifier field.
     */
    static void write_identifier(char *dest, char const *value, l4_size_t len)
    {
      auto value_length = strlen(value);
      assert(value_length <= len && "Supplied identifier fits into field.");
      memcpy(dest, value, value_length);
      // Pad the remainder of the field with spaces.
      memset(dest + value_length, ' ', len - value_length);
    }

    /**
     * Write a common header for ACPI tables as defined in section 5.2.6 of the
     * ACPI Specification.
     *
     * \param h    Table header.
     * \param sig  Signature as described in Table 5-29.
     * \param rev  Revision of the table.
     * \param len  Total length of the table.
     */
    void write_header(ACPI_TABLE_HEADER *h, char const *sig, l4_uint8_t rev,
                      l4_uint32_t len)
    {
      memcpy(h->Signature, sig, ACPI_NAMESEG_SIZE);
      h->Length = len;
      h->Revision = rev;
      add_checksum(&h->Checksum, h, len);
      write_identifier(h->OemId, "L4RE", ACPI_OEM_ID_SIZE);
      write_identifier(h->OemTableId, "UVMM", ACPI_OEM_TABLE_ID_SIZE);
      h->OemRevision = 1;
      memcpy(h->AslCompilerId, "UVMM", ACPI_NAMESEG_SIZE);
      h->AslCompilerRevision = 1;
    }

    /**
     * Write header for a table and automatically determine size as delta
     * between start position of the table and the current position of the
     * writer.
     *
     * Useful for tables with dynamic size.
     *
     * \param h    Table header, must be at the very beginning of the table.
     * \param sig  Signature as described in Table 5-29.
     * \param rev  Revision of the table.
     */
    void end_table(ACPI_TABLE_HEADER *h, char const *sig, l4_uint8_t rev)
    {
      write_header(h, sig, rev, _pos - as_offset(h));
    }

    /**
     * Reserve an MADT subtable and write its header.
     *
     * \tparam T    Type of the MADT subtable.
     * \param type  MADT subtable type.
     */
    template<typename T>
    T *reserve_madt_subtable(enum AcpiMadtType type)
    {
      T *subtable = reserve<T>();
      subtable->Header.Type = type;
      subtable->Header.Length = sizeof(T);
      return subtable;
    }

    /**
     * Add fixup for table reference field.
     *
     * \tparam T     Type of the table reference field.
     * \param ref    Table reference field.
     * \param table  Table that is referenced.
     */
    template<typename T>
    void add_table_ref(T const *ref, Table table)
    {
      _table_refs.emplace_back(Table_ref{as_offset(ref), sizeof(T), table});
    }

    /**
     * Add fixup for checksum field.
     *
     * \param checksum  Checksum field.
     * \param base      Pointer to start of memory area to checksum.
     * \param len       Length of the memory area to checksum.
     */
    void add_checksum(l4_uint8_t *checksum, void *base, unsigned len)
    {
      // Although we do not calculate the checksum here, ensure that the
      // checksum field is zeroed, which is required for checksum computation.
      *checksum = 0U;
      _checksums.emplace_back(Checksum{as_offset(checksum),
                                       as_offset(base), len});
    }

    /**
     * Table reference placeholder.
     */
    struct Table_ref
    {
      /// Offset of table reference field in write buffer.
      unsigned offset;
      /// Size of table reference field.
      unsigned size;
      /// Table that is referenced.
      Table table;
    };

    /**
     * Checksum placeholder.
     */
    struct Checksum
    {
      /// Offset of checksum field in write buffer.
      unsigned field_off;
      /// Offset of the memory area to checksum in write buffer.
      unsigned offset;
      /// Length of the memory area to checksum.
      unsigned len;
    };

    /// Return table reference placeholders.
    std::vector<Table_ref> const &table_refs() const { return _table_refs; }
    /// Return checksum placeholders.
    std::vector<Checksum> const &checksums() const { return _checksums; }

    /**
     * Return start offset of the given table.
     */
    unsigned table_offset(Table table) const
    { return _tables[static_cast<unsigned>(table)]; }

    /**
     * Convert offset into virtual address.
     */
    l4_addr_t as_addr(unsigned offset) const
    {
      assert(offset < _buf_size);
      return _buf_addr + offset;
    }

    /**
     * Convert offset into pointer.
     */
    template<typename T = void>
    T *as_ptr(unsigned offset) const
    { return reinterpret_cast<T *>(as_addr(offset)); }

  private:
    /// Convert a pointer into the write buffer back into an offset.
    unsigned as_offset(void const *ptr) const
    {
      l4_addr_t addr = reinterpret_cast<l4_addr_t>(ptr);
      assert(addr >= _buf_addr);
      return addr - _buf_addr;
    }

    l4_addr_t _buf_addr;
    unsigned _buf_size;
    unsigned _pos;
    // Value-initialized: table_offset() must never read an indeterminate
    // value for a table that was not registered via start_table(). The
    // original declaration left the array uninitialized.
    std::array<unsigned, static_cast<unsigned>(Table::Num_values)> _tables{};
    std::vector<Table_ref> _table_refs;
    std::vector<Checksum> _checksums;
  }; // class Writer

  /**
   * Write a Root System Description Pointer (RSDP).
   *
   * Base ACPI structure as defined in section 5.2.5 of the ACPI Specification.
   * This class includes the ACPI 2.0+ extensions.
   */
  static void write_rsdp(Writer &wr)
  {
    auto *t = wr.reserve<ACPI_TABLE_RSDP>(Rsdp_size);
    memcpy(t->Signature, ACPI_SIG_RSDP, sizeof(t->Signature));
    // The v1 checksum only covers the ACPI 1.0 part of the structure.
    wr.add_checksum(&t->Checksum, t, Rsdp_v1_size);
    wr.write_identifier(t->OemId, "L4RE", ACPI_OEM_ID_SIZE);
    if (Vmm::Cpu_dev::get_max_vcpu_id() >= 0xff)
      t->Revision = 4; // needs Local X2APIC MADT entries: ACPI 4.0+
    else
      t->Revision = 2; // ACPI 2.0+
    wr.add_table_ref(&t->RsdtPhysicalAddress, Table::Rsdt);
    wr.add_table_ref(&t->XsdtPhysicalAddress, Table::Xsdt);
    t->Length = Rsdp_size;
    // The extended checksum covers the whole structure.
    wr.add_checksum(&t->ExtendedChecksum, t, Rsdp_size);
  }

  /**
   * Writes all implemented ACPI tables.
   */
  static void write_all_tables(Writer &wr, Vdev::Device_lookup *devs)
  {
    write_rsdt(wr);
    write_xsdt(wr);
    write_fadt(wr);
    write_madt(wr, devs->cpus()->max_cpuid() + 1, devs->cpus(),
               Madt_int_override_storage::get()->overrides());
    write_mcfg(wr);
    write_facs(wr);
    write_dsdt(wr);
  }

  /**
   * Compute ACPI checksum for memory area.
   *
   * \param dest  Base address of the memory area.
   * \param len   Length of the memory area.
   *
   * \return Value so that the sum of all bytes in the memory area modulo 256
   *         is zero.
   */
  static l4_uint8_t compute_checksum(void *dest, unsigned len)
  {
    l4_uint8_t *bytes = reinterpret_cast<l4_uint8_t *>(dest);
    l4_uint8_t sum = 0;
    for (unsigned i = 0; i < len; i++)
      sum += bytes[i];

    return -sum;
  }

private:
  /**
   * Write a Root System Description Table (RSDT) or an Extended System
   * Description Table (XSDT).
   *
   * Table holding pointers to other system description tables as defined in
   * sections 5.2.7 (RSDT) and 5.2.8 (XSDT) of the ACPI 3.0 Specification.
   */
  template <typename TABLE>
  static void write_rsdt_xsdt(Writer &wr)
  {
    // Tables that RSDT / XSDT refers to. This array is the single
    // authoritative list; the table size and the reference fixups below are
    // both derived from it.
    static constexpr std::array<Table, 3> ref_tables = {
      Table::Madt,
      Table::Fadt,
      Table::Mcfg,
    };

    // RSDT/XSDT table header plus a 32/64-bit word per table pointer.
    constexpr auto size =
      Header_size + ref_tables.size() * sizeof(TABLE::TableOffsetEntry[0]);

    constexpr Table table
      = (std::is_same<TABLE, ACPI_TABLE_RSDT>::value)
        ? Table::Rsdt : Table::Xsdt;
    auto *t = wr.start_table<TABLE>(table, size);

    // The acpi_table_{rsdt/xsdt} struct defines only one entry, but we simply
    // use the extra space allocated in the header.
    for (l4_size_t i = 0; i < ref_tables.size(); i++)
      wr.add_table_ref(&t->TableOffsetEntry[i], ref_tables[i]);

    constexpr char const *sig
      = (std::is_same<TABLE, ACPI_TABLE_RSDT>::value)
        ? ACPI_SIG_RSDT : ACPI_SIG_XSDT;
    wr.end_table(&t->Header, sig, 1);
  }

  /**
   * Write a Root System Description Table (RSDT).
   *
   * Table holding pointers to other system description tables as defined in
   * section 5.2.7 of the ACPI 3.0 Specification.
   */
  static void write_rsdt(Writer &wr)
  {
    write_rsdt_xsdt<ACPI_TABLE_RSDT>(wr);
  }

  /**
   * Write an Extended System Description Table (XSDT).
   *
   * Table holding pointers to other system description tables as defined in
   * section 5.2.8 of the ACPI 3.0 Specification.
   */
  static void write_xsdt(Writer &wr)
  {
    write_rsdt_xsdt<ACPI_TABLE_XSDT>(wr);
  }

  /**
   * Write a Fixed ACPI Description Table (FADT).
   *
   * Table providing fixed hardware information as defined in section 5.2.8 of
   * the ACPI Specification.
   */
  static void write_fadt(Writer &wr)
  {
    auto *t = wr.start_table<ACPI_TABLE_FADT>(Table::Fadt);
    // Switching on Hardware-Reduced ACPI has the positive effect of
    // eliminating a lot of legacy features we do not implement.
    // However, with that flag on Linux requires the DSDT to be properly set
    // up for finding PCI devices.
    // t->Flags = (1 << 20); // HW_REDUCED_ACPI
    wr.add_table_ref(&t->Dsdt, Table::Dsdt);
    t->XDsdt = 0; // For now we don't implement the extended DSDT.
    wr.add_table_ref(&t->Facs, Table::Facs);
    t->XFacs = 0;
    // How to pick the ID?
    t->HypervisorId = 0;

    // Let registered devices fill in their FADT fields.
    for (auto const &d : Acpi_device_hub::get()->devices())
      d->amend_fadt(t);

    // Emulate ACPI 6.3.
    wr.end_table(&t->Header, ACPI_SIG_FADT, 6);
    t->MinorRevision = 3;
  }

  /**
   * Construct a Multiple APIC Description Table (MADT).
   *
   * The MADT lists Advanced Programmable Interrupt Controllers in the system
   * as defined in section 5.2.12 of the ACPI Specification.
   *
   * \param nr_cpus             The number of enabled CPUs.
   * \param cpus                Pointer to the CPU container.
   * \param madt_int_overrides  Interrupt source override entries to emit.
   */
  static void
  write_madt(Writer &wr, unsigned nr_cpus,
             cxx::Ref_ptr<Vmm::Cpu_dev_array> cpus,
             std::vector<Madt_int_override> const &madt_int_overrides)
  {
    auto *t = wr.start_table<ACPI_TABLE_MADT>(Table::Madt);
    t->Address = Gic::Lapic_access_handler::Mmio_addr;
    // ACPI 6.3 Specification, Table 5-44:
    // not a PC-AT-compatible dual-8259 setup
    t->Flags = 0;

    // I/O APIC Structure.
    // Provide information about the system's I/O APICs as defined in section
    // 5.2.12.3 of the ACPI Specification.
    auto *ioapic = wr.reserve_madt_subtable<ACPI_MADT_IO_APIC>(
      ACPI_MADT_TYPE_IO_APIC);
    ioapic->Reserved = 0;
    ioapic->Id = 0;
    ioapic->Address = Gic::Io_apic::Mmio_addr;
    ioapic->GlobalIrqBase = 0;

    // Interrupt Override Structure.
    // Information about overriding ISA specified interrupt numbers with new
    // ones.
    for (auto const &over : madt_int_overrides)
      {
        auto *tbl = wr.reserve_madt_subtable<ACPI_MADT_INTERRUPT_OVERRIDE>(
          ACPI_MADT_TYPE_INTERRUPT_OVERRIDE);
        tbl->Bus = 0;
        tbl->SourceIrq = over.src_irq;
        tbl->GlobalIrq = over.gsi;
        tbl->IntiFlags = over.flags;
      }

    // Processor Local APIC Structure.
    // Structure to be appended to the MADT base table for each local APIC.
    // Defined in section 5.2.12.2 of the ACPI Specification.
    for (unsigned i = 0; i < nr_cpus; ++i)
      {
        // ACPI spec 4.0 / 5.2.12.12: Processor Local x2APIC Structure: Logical
        // processors with APIC ID values less than 255 must use the Processor
        // Local APIC structure to convey their APIC information to OSPM.
        unsigned vcpu_id = cpus->vcpu(i).get_vcpu_id();
        if (vcpu_id < 0xff)
          {
            auto *lapic = wr.reserve_madt_subtable<ACPI_MADT_LOCAL_APIC>(
              ACPI_MADT_TYPE_LOCAL_APIC);
            lapic->ProcessorId = i;
            lapic->Id = vcpu_id;
            lapic->LapicFlags = 1; // Enable CPU.
          }
      }

    // Processor Local X2APIC Structure.
    // Structure to be appended to the MADT base table for each local X2APIC.
    // Defined in section 5.2.12.12 of the ACPI 4.0 Specification.
    for (unsigned i = 0; i < nr_cpus; ++i)
      {
        unsigned vcpu_id = cpus->vcpu(i).get_vcpu_id();
        if (vcpu_id >= 0xff)
          {
            auto *lx2apic = wr.reserve_madt_subtable<ACPI_MADT_LOCAL_X2APIC>(
              ACPI_MADT_TYPE_LOCAL_X2APIC);
            lx2apic->LocalApicId = vcpu_id;
            lx2apic->LapicFlags = 1; // Enable CPU.
            // NOTE(review): all X2APIC entries get Uid 0, while the LAPIC
            // entries above use ProcessorId = i. Confirm whether Uid should
            // carry the processor UID here.
            lx2apic->Uid = 0;
          }
      }

    // Finally fill the table header.
    wr.end_table(&t->Header, ACPI_SIG_MADT, 5);
  }

  /**
   * Write PCI Express memory mapped configuration space base address
   * Description Table (MCFG).
   */
  static void write_mcfg(Writer &wr)
  {
    auto *t = wr.start_table<ACPI_TABLE_MCFG>(Table::Mcfg);

    // Each device appends its allocation entries directly at the current
    // write position and reports how many bytes it used.
    for (auto const &d : Acpi_device_hub::get()->devices())
      {
        auto *ptr = wr.as_ptr<ACPI_MCFG_ALLOCATION>(wr.pos());
        auto amend_size = d->amend_mcfg(ptr, wr.remaining_size());
        wr.reserve(amend_size);
      }

    wr.end_table(&t->Header, ACPI_SIG_MCFG, 1);
  }

  /**
   * Write a Firmware ACPI Control Structure (FACS).
   */
  static void write_facs(Writer &wr)
  {
    // The FACS requires 64-byte alignment.
    auto *t = wr.start_table<ACPI_TABLE_FACS>(Table::Facs, Facs_size, 64);
    memcpy(t->Signature, ACPI_SIG_FACS, ACPI_NAMESEG_SIZE);
    t->Length = Facs_size;
    t->Version = 2;
    // other fields written by OSPM or should be zero.
  }

  /**
   * Write Differentiated System Description Table (DSDT).
   */
  static void write_dsdt(Writer &wr)
  {
    auto *t = wr.start_table<ACPI_TABLE_HEADER>(Table::Dsdt);

    // Collect the highest priority DSDT fragments of ACPI devices.
    for (auto const &d : Acpi_device_hub::get()->devices())
      {
        void *ptr = wr.as_ptr(wr.pos());
        auto amend_size = d->amend_dsdt(ptr, wr.remaining_size());
        wr.reserve(amend_size);
      }

    // Collect the lowest priority DSDT fragments of ACPI devices.
    for (auto const &d : Acpi_device_hub::get()->devices())
      {
        void *ptr = wr.as_ptr(wr.pos());
        auto amend_size = d->amend_dsdt_late(ptr, wr.remaining_size());
        wr.reserve(amend_size);
      }

    // The revision of DSDT controls the integer width of AML code/interpreter.
    // Values less than two imply 32-bit integers and math, otherwise 64-bit
    // (see also ComplianceRevision in AML DefinitionBlock)
    wr.end_table(t, ACPI_SIG_DSDT, 1);
  }
};
class Bios_tables : public Tables
{
enum : l4_uint32_t
{
/**
* Physical location of the RSDP according to section 5.2.5.1 of the ACPI
* Specification.
*/
Phys_start_addr = 0x0E0000
};
public:
/**
* ACPI control structure.
*
* \param ram Guest RAM.
*/
Bios_tables(Vdev::Device_lookup *devs)
: _devs(devs)
{
info.printf("Initialize legacy BIOS ACPI tables.\n");
_dest_addr = _devs->ram()->guest2host<l4_addr_t>(Vmm::Guest_addr(Phys_start_addr));
}
/**
* Calculate positions for each table and write them in place.
*/
void write_to_guest()
{
// we allow the rsdp and all tables to take up one page
l4_size_t max_size = L4_PAGESIZE;
auto acpi_mem = Vmm::Region::ss(Vmm::Guest_addr(Phys_start_addr), max_size,
Vmm::Region_type::Ram);
// Throws an exception if the ACPI memory region isn't within guest RAM.
_devs->ram()->guest2host<l4_addr_t>(acpi_mem);
// Clear memory because we do not rely on the DS provider to do this for
// us, and we must not have spurious values in ACPI tables.
memset(reinterpret_cast<void *>(_dest_addr), 0, max_size);
Writer wr(_dest_addr, max_size);
write_rsdp(wr);
write_all_tables(wr, _devs);
resolve_table_refs_and_checksums(wr);
l4_addr_t facs_off = wr.table_offset(Tables::Table::Facs);
Facs_storage::get()->set_addr(wr.as_ptr<ACPI_TABLE_FACS>(facs_off));
Facs_storage::get()->set_gaddr(
acpi_phys_addr<l4_uint32_t>(wr.as_addr(facs_off)));
}
private:
void resolve_table_refs_and_checksums(Writer &wr)
{
for (Writer::Table_ref const &ref : wr.table_refs())
{
l4_addr_t table_addr = wr.as_addr(wr.table_offset(ref.table));
if (ref.size == sizeof(l4_uint32_t))
*wr.as_ptr<l4_uint32_t>(ref.offset) =
acpi_phys_addr<l4_uint32_t>(table_addr);
else if (ref.size == sizeof(l4_uint64_t)) // XSDT
*wr.as_ptr<l4_uint64_t>(ref.offset) =
acpi_phys_addr<l4_uint64_t>(table_addr);
else
L4Re::throw_error(-L4_EINVAL, "Unsupported table offset size.");
}
for (Writer::Checksum const &checksum : wr.checksums())
{
l4_uint8_t *field = wr.as_ptr<l4_uint8_t>(checksum.field_off);
// Calculate and write checksum.
*field = compute_checksum(wr.as_ptr(checksum.offset), checksum.len);
}
}
/**
* Compute guest-physical address of target table.
*
* \param virt_target_addr Virtual address of the target table.
*
* \return 32-bit guest-physical address of the target table.
*/
template <typename T>
T acpi_phys_addr(l4_addr_t virt_target_addr) const
{
return Phys_start_addr + static_cast<T>(virt_target_addr - _dest_addr);
}
Vdev::Device_lookup *_devs;
l4_addr_t _dest_addr;
};
} // namespace Acpi

View File

@@ -0,0 +1,555 @@
/*
* Copyright (C) 2020-2024 Kernkonzept GmbH.
* Author(s): Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/sys/vcon>
#include "acpi.h"
#include "device_factory.h"
#include "irq_dt.h"
#include "monitor/virtio_input_power_cmd_handler.h"
#include "vbus_event.h"
#include <l4/re/event_enums.h>
#include <l4/vbus/vbus>
#include <l4/vbus/vbus_inhibitor.h>
namespace Acpi {
/**
* \file
* Acpi platform support
*
* This implements minimal Acpi command support. Enough that Linux believes
* that Acpi works and that it uses Acpi shutdown.
*
* This requires a device tree entry like this.
*
* \code{.dtb}
* acpi_platform {
* compatible = "virt-acpi";
* interrupt-parent = <&PIC>;
* interrupts = <9>;
* // Optional: Connect vcon to trigger ACPI power events.
* // l4vmm,pwrinput = "acpi_pwr_input";
* };
* \endcode
*
* You may configure a different interrupt number for the system control
* interrupt (SCI), but make sure it does not collide.
*
* The interrupt parent is mandatory. The SCI is currently only used during
* Acpi probing.
*/
template <typename DEV>
class Vcon_pwr_input
: public L4::Irqep_t<Vcon_pwr_input<DEV> >,
  public Monitor::Virtio_input_power_cmd_handler<Monitor::Enabled,
                                                 Vcon_pwr_input<DEV>>
{
  friend Monitor::Virtio_input_power_cmd_handler<Monitor::Enabled,
                                                 Vcon_pwr_input<DEV>>;

public:
  /// \param con  Vcon capability that delivers power-event commands.
  Vcon_pwr_input(L4::Cap<L4::Vcon> con)
  : _con(con) {}

  ~Vcon_pwr_input()
  {
    if (_con_irq.is_valid())
      {
        // Bug fix: the previous code was `if (long err = l4_error(...) < 0)`,
        // which -- due to operator precedence -- bound the *comparison
        // result* (0 or 1) to `err`, so l4sys_errtostr() was always handed
        // the value 1. Evaluate the error code first, then test it.
        long err = l4_error(_con->unbind(0, _con_irq));
        if (err < 0)
          warn().printf("Unbind notification IRQ from Vcon: %s.\n",
                        l4sys_errtostr(err));
      }
  }

  /// Register the notification IRQ and bind it to the Vcon.
  void register_obj(L4::Registry_iface *registry)
  {
    _con_irq = L4Re::chkcap(registry->register_irq_obj(this),
                            "Register IRQ of Vcon-pwr-input device.");
    L4Re::chksys(_con->bind(0, _con_irq),
                 "Binding Vcon notification irq failed.\n");
  }

  void handle_irq();
  bool inject_command(char cmd);

private:
  static Dbg warn() { return Dbg(Dbg::Dev, Dbg::Warn, "pwr-input"); }
  static Dbg trace() { return Dbg(Dbg::Dev, Dbg::Trace, "pwr-input"); }

  DEV *dev() { return static_cast<DEV *>(this); }

  L4::Cap<L4::Vcon> _con;   ///< Command input channel
  L4::Cap<L4::Irq> _con_irq; ///< Notification IRQ bound to _con
};
template<typename DEV>
void
Vcon_pwr_input<DEV>::handle_irq()
{
  // Drain the Vcon: process one command character per iteration until the
  // input buffer is empty or a read error occurs.
  for (;;)
    {
      // A zero-length read reports the number of pending characters.
      if (_con->read(NULL, 0) <= 0)
        break; // empty

      char cmd;
      int bytes = _con->read(&cmd, sizeof(cmd));
      if (bytes < 0)
        {
          Err().printf("Vcon_pwr_input: read error: %d\n", bytes);
          break;
        }

      inject_command(cmd);
      trace().printf("Vcon_pwr_input::handle_irq OK\n");
      _con->write("OK\n", 3);
    }
}
template<typename DEV>
bool
Vcon_pwr_input<DEV>::inject_command(char cmd)
{
  trace().printf("cmd=%c\n", cmd);

  // Map the command character onto a power event; returns whether an event
  // was actually injected.
  switch (cmd)
    {
    case 'a': // apm suspend
    case 's': // suspend
    case 'l': // sleep
      return dev()->inject_slpbtn();

    case 'p': // power
    case 'q': // power2
      return dev()->inject_pwrbtn();

    case 'h': // print the command overview
      {
        char response[] = "a: apm suspend\ns: suspend\nl: sleep\np: power\n"
                          "q: power2\n";
        _con->write(response, sizeof(response) - 1);
        return false;
      }

    default:
      warn().printf("Unknown character '%c'\n", cmd);
      return false;
    }
}
/**
 * Virtual ACPI platform device.
 *
 * Emulates the ACPI fixed-hardware register blocks (SMI command port, PM1a
 * event/control blocks, PM2 control block and the reset register) on the IO
 * ports starting at 0x1800, announces them via the FADT/DSDT and translates
 * guest accesses into VMM actions: shutdown, suspend, reboot and SCI
 * injection for power/sleep-button events. Power events can originate from
 * a Vcon channel (Vcon_pwr_input base) or from vbus inhibitor signals
 * (Vbus_stream_id_handler base).
 */
class Acpi_platform:
public Vmm::Io_device,
public Vdev::Device,
public Acpi_device,
public Vcon_pwr_input<Acpi_platform>,
public Vbus_stream_id_handler
{
private:
// Magic values the guest writes to the SMI command port, the reset
// register, or encodes into the sleep-type field of PM1a control.
enum Command_values : l4_uint16_t
{
Acpi_enable = 0xf2,
Acpi_disable = 0xf1,
Acpi_shutdown = 0x7,
Acpi_suspend = 0x6,
Reboot = 0x4,
};
public:
// IO-port layout of the emulated register blocks; advertised in the FADT.
enum Ports : l4_uint16_t
{
Ports_start = 0x1800,
Smi_command = Ports_start,
Pm1a_cmd_block = Smi_command + 1, // 0x1801
Pm1a_cmd_length = 2, // 2 ports
Pm2_cmd_block = Pm1a_cmd_block + Pm1a_cmd_length, // 0x1803
Pm2_cmd_length = 1, // 1 port
Pm1a_event_block= Pm2_cmd_block + Pm2_cmd_length, // 0x1804
Pm1a_sts = Pm1a_event_block,
Pm1a_en = Pm1a_event_block + 2,
Pm1_event_length= 4,
Reset_register = Pm1a_event_block + Pm1_event_length, // 0x1808
Ports_last = Reset_register, // inclusive end
};
// PM1 event status/enable bits (ACPI spec section 4.8.3.1).
enum Events : l4_uint32_t
{
Pm1a_evt_gbl = 1U << 5,
Pm1a_evt_pwrbtn = 1U << 8,
Pm1a_evt_slpbtn = 1U << 9,
Pm1a_evt_rtc = 1U << 10,
// PM1 events we implement.
Pm1a_evt_supported = Pm1a_evt_gbl | Pm1a_evt_pwrbtn | Pm1a_evt_slpbtn
| Pm1a_evt_rtc,
};
/**
 * \param devs      Device lookup used to reach the VMM and the vbus.
 * \param ic        Virtual interrupt controller that delivers the SCI.
 * \param irq       SCI interrupt number, also reported in the FADT.
 * \param pwr_vcon  Optional Vcon channel for externally injected power
 *                  events (may be invalid).
 */
Acpi_platform(Vdev::Device_lookup *devs, cxx::Ref_ptr<Gic::Ic> const &ic, int irq,
L4::Cap<L4::Vcon> pwr_vcon)
: Acpi_device(), Vcon_pwr_input<Acpi_platform>(pwr_vcon),
_vmm(devs->vmm()),
_sci(ic, irq),
_irq(irq),
_acpi_enabled(false),
_pm1a_sts(0),
_pm1a_en(0)
{
// Without a vbus there are no inhibitor signals to listen for.
if (!devs->vbus()->available())
return;
auto vbus = devs->vbus()->bus();
info().printf("Registering as event handler for vbus->root() = %lx\n",
vbus->root().dev_handle());
Vbus_event::register_stream_id_handler(vbus->root().dev_handle(), this);
}
char const *dev_name() const override
{ return "ACPI platform"; }
/// Describe the emulated fixed hardware in the FADT.
void amend_fadt(ACPI_TABLE_FADT *t) const override
{
t->SmiCommand = Ports::Smi_command; // 32-bit port address of SMI command port
t->SciInterrupt = _irq;
t->AcpiEnable = Command_values::Acpi_enable;
t->AcpiDisable = Command_values::Acpi_disable;
// 32-bit port address of Power Mgt 1a Control Reg Block
t->Pm1aControlBlock = Ports::Pm1a_cmd_block;
// size of block
t->Pm1ControlLength = Ports::Pm1a_cmd_length;
// 32-bit port address of Power Mgt 2 Control Reg Block
t->Pm2ControlBlock = Ports::Pm2_cmd_block;
// size of block
t->Pm2ControlLength = Ports::Pm2_cmd_length;
t->Pm1aEventBlock = Ports::Pm1a_event_block;
t->Pm1EventLength = Ports::Pm1_event_length;
// Indicate the presence of an i8042 keyboard controller.
if (_vmm->i8042_present())
t->BootFlags |= ACPI_FADT_8042;
// set the reset register for ACPI reboot
t->Flags |= ACPI_FADT_RESET_REGISTER;
t->ResetRegister.Address = Ports::Reset_register;
t->ResetRegister.SpaceId = ACPI_ADR_SPACE_SYSTEM_IO;
t->ResetRegister.BitWidth = ACPI_RESET_REGISTER_WIDTH;
t->ResetValue = Command_values::Reboot;
}
/**
 * Write an ACPI control object to the DSDT table that allows the guest to
 * discover shutdown capability.
 *
 * This is described in section 7.4.2 of the ACPI specification.
 *
 * \param buf The memory are where to put the object.
 * \param max_size Maximum available size of the designated memory area.
 */
l4_size_t amend_dsdt(void *buf, l4_size_t max_size) const override
{
// _S3 == suspend to ram
// _S5 == shutdown
// AML-encoded Name(_S3, Package(4){...}) / Name(_S5, Package(4){...})
// objects; the first two package entries are the SLP_TYPa/SLP_TYPb values
// the guest writes to the PM1 control blocks.
unsigned char dsdt_S3S5 [] =
{
0x08, 0x5F, 0x53, '3', 0x5F, 0x12, 0x08, 0x04,
0x0A, Command_values::Acpi_suspend,
0x0A, Command_values::Acpi_suspend,
0x00, 0x00,
0x08, 0x5F, 0x53, '5', 0x5F, 0x12, 0x08, 0x04,
0x0A, Command_values::Acpi_shutdown,
0x0A, Command_values::Acpi_shutdown,
0x00, 0x00,
};
l4_size_t size = sizeof(dsdt_S3S5);
if (max_size < size)
L4Re::throw_error(-L4_ENOMEM,
"Not enough space in DSDT");
memcpy(buf, reinterpret_cast<void*>(dsdt_S3S5), size);
return size;
}
/**
 * Handle pm1a enable register.
 *
 * This handles a subset of the PM1A enable register as described in section
 * 4.8.3.1 of the ACPI specification. We support GBL_EN, PRWBTN_EN,
 * SLPBTN_EN and the RTC_EN bits. If both the corresponding status and the
 * enable bit is set, we inject an SCI.
 */
void handle_pm1a_en()
{
if (!_acpi_enabled)
return;
// if sts and en bits are set we issue an SCI
if (_pm1a_sts & _pm1a_en & Pm1a_evt_supported)
{
trace().printf("Injecting SCI\n");
_sci.inject();
}
trace().printf("_pm1a_sts = 0x%x _pm1a_en = 0x%x\n", _pm1a_sts, _pm1a_en);
}
/**
 * Handle a subset of the pm1a control register.
 *
 * This function handles the PM1A control register as described in section
 * 4.8.3.2 of the ACPI specification. We only handle the SLP_EN and SLP_TYPx
 * bits.
 *
 * \param value The value written to the register.
 */
void handle_pm1a_control(l4_uint32_t value)
{
enum
{
Slp_enable = 1 << 13,
Slp_type_shutdown = Acpi_shutdown << 10,
Slp_type_suspend = Acpi_suspend << 10,
Slp_type_mask = 0x7 << 10,
};
static_assert((Slp_type_shutdown & Slp_type_mask) == Slp_type_shutdown,
"ACPI platform: Sleep type shutdown within field bounds");
static_assert((Slp_type_suspend & Slp_type_mask) == Slp_type_suspend,
"ACPI platform: Sleep type suspend within field bounds");
if (value & Slp_enable)
{
if ((value & Slp_type_mask) == Slp_type_shutdown)
{
trace().printf("Guest requested power off. Bye\n");
_vmm->shutdown(Vmm::Guest::Shutdown);
}
else if ((value & Slp_type_mask) == Slp_type_suspend)
{
trace().printf("System suspend requested\n");
// If Uvmm loaded a guest Linux kernel itself, it emulates
// firmware behaviour by resuming the guest directly at the
// address the guest specified in the FACS.
// Otherwise the VM resumes at the reset vector where firmware
// shall take care of guest resume.
if (_vmm->guest_type() == Boot::Binary_type::Linux)
_vmm->suspend(Facs_storage::get()->waking_vector());
else
_vmm->suspend(0xffff'fff0);
}
}
}
/**
 * Handle IO port reads to the device.
 *
 * \param port IO port
 * \param[out] value The value read from the IO port.
 */
void io_in(unsigned port, Vmm::Mem_access::Width /*width*/,
l4_uint32_t *value) override
{
// `port` arrives relative to the region start; rebase to absolute port
// numbers so the cases below can use the Ports enum directly.
port += Smi_command;
*value = -1U;
switch (port)
{
case Smi_command:
*value = 0;
break;
case Pm1a_cmd_block:
if (_acpi_enabled)
*value = 1; // SCI_EN == 1 (PM1 control bit 0): ACPI mode active
else
*value = 0;
break;
case Pm1a_sts:
trace().printf("read _pm1a_sts = 0x%x\n", _pm1a_sts);
*value = _pm1a_sts;
break;
case Pm1a_en:
trace().printf("read _pm1a_en = 0x%x\n", _pm1a_en);
*value = _pm1a_en;
break;
default:
trace().printf("IO IN port=%x value=%x\n", port, *value);
break;
}
}
/**
 * Handle IO port writes to device IO ports.
 *
 * \param port IO Port
 * \param value The value written to the port.
 */
void io_out(unsigned port, Vmm::Mem_access::Width /*width*/,
l4_uint32_t value) override
{
// Rebase the region-relative port to absolute port numbers.
port += Smi_command;
switch (port)
{
case Smi_command:
if (value == Acpi_enable)
{
trace().printf("Acpi enabled\n");
_acpi_enabled = true;
}
else if (value == Acpi_disable)
{
trace().printf("Acpi disabled\n");
_acpi_enabled = false;
}
break;
case Pm1a_cmd_block:
handle_pm1a_control(value);
break;
case Pm1a_sts:
trace().printf("write _pm1a_sts = 0x%x\n", value);
// Status bits are write-1-to-clear per the ACPI spec.
_pm1a_sts &= ~(value & Pm1a_evt_supported);
if ((_pm1a_sts & _pm1a_en) == 0U)
{
trace().printf("SCI ack\n");
_sci.ack();
}
break;
case Pm1a_en:
trace().printf("write _pm1a_en = 0x%x\n", value);
_pm1a_en = value;
handle_pm1a_en();
break;
case Reset_register:
if (value == Command_values::Reboot)
{
trace().printf("Reboot requested. Bye\n");
_vmm->shutdown(Vmm::Guest::Reboot);
}
break;
default:
trace().printf("IO OUT port=%x value=%x\n", port, value);
break;
}
}
/// Inject a sleep-button event; returns false if masked or ACPI is off.
bool inject_slpbtn()
{
if (!_acpi_enabled || !(_pm1a_en & Pm1a_evt_slpbtn))
return false;
_pm1a_sts |= Pm1a_evt_slpbtn;
_sci.inject();
return true;
}
/// Inject a power-button event; returns false if masked or ACPI is off.
bool inject_pwrbtn()
{
if (!_acpi_enabled || !(_pm1a_en & Pm1a_evt_pwrbtn))
return false;
_pm1a_sts |= Pm1a_evt_pwrbtn;
_sci.inject();
return true;
}
void handle_event(L4Re::Event_buffer::Event *e) override
{
// Here we handle inhibitor signals.
//
// Iff Uvmm has a vbus, it will grab inhibitor locks for suspend and
// shutdown. The rationale is that IO is only allowed to shutdown and/or
// suspend the system once all inhibitor locks are free. To that end, IO
// will send out inhibitor signals to its vbus clients. The clients shall
// suspend/shutdown their devices and free the inhibitor lock.
//
// Management of the locks itself is done in pm.{cc,h}
if (e->payload.type != L4RE_EV_PM)
{
warn().printf("Unexpected event type (0x%x). Ignoring.\n", e->payload.type);
return;
}
switch (e->payload.code)
{
case L4VBUS_INHIBITOR_SUSPEND:
info().printf("SUSPEND signal\n");
inject_slpbtn();
break;
case L4VBUS_INHIBITOR_SHUTDOWN:
info().printf("SHUTDOWN signal\n");
inject_pwrbtn();
break;
case L4VBUS_INHIBITOR_WAKEUP:
// The IPC for this signal will have woken Uvmm up. Nothing to do
// here.
break;
default:
warn().printf("Unknown PM event: code 0x%x.\n", e->payload.code);
break;
}
}
private:
static Dbg trace() { return Dbg(Dbg::Dev, Dbg::Trace, "Acpi_platform"); }
static Dbg warn() { return Dbg(Dbg::Dev, Dbg::Warn, "Acpi_platform"); }
static Dbg info() { return Dbg(Dbg::Dev, Dbg::Info, "Acpi_platform"); }
Vmm::Guest *_vmm;
Vmm::Irq_sink _sci; // system control interrupt (SCI) sink
unsigned const _irq; // SCI number, reported in the FADT
bool _acpi_enabled; // guest enabled ACPI mode via the SMI command port
l4_uint32_t _pm1a_sts, _pm1a_en; // PM1a status / enable register state
};
} // namespace Acpi
/***********************************************************************/
namespace
{
/// Device-tree factory for the "virt-acpi" platform device.
struct F : Vdev::Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &node) override
  {
    // The SCI has to be routed through a virtual interrupt controller.
    Vdev::Irq_dt_iterator irq_it(devs, node);
    if (irq_it.next(devs) < 0)
      return nullptr;

    if (!irq_it.ic_is_virt())
      L4Re::throw_error(-L4_EINVAL, "Acpi_platform requires a virtual "
                                    "interrupt controller");

    // Optional Vcon channel used to inject power events from outside.
    auto pwr_vcon = Vdev::get_cap<L4::Vcon>(node, "l4vmm,pwrinput");
    auto platform = Vdev::make_device<Acpi::Acpi_platform>(devs, irq_it.ic(),
                                                           irq_it.irq(),
                                                           pwr_vcon);
    if (pwr_vcon)
      platform->register_obj(devs->vmm()->registry());

    Dbg().printf("Creating Acpi_platform\n");

    // Claim the whole emulated register block (inclusive range).
    auto *guest = devs->vmm();
    auto first = Acpi::Acpi_platform::Ports::Ports_start;
    auto last = Acpi::Acpi_platform::Ports::Ports_last;
    guest->add_io_device(Vmm::Io_region(first, last,
                                        Vmm::Region_type::Virtual),
                         platform);
    return platform;
  }
}; // struct F
static F f;
static Vdev::Device_type t = {"virt-acpi", nullptr, &f};
}

View File

@@ -0,0 +1,106 @@
/*
* Copyright (C) 2021-2024 Kernkonzept GmbH.
* Author(s): Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
/**
* The ACPI PM TIMER is documented in the ACPI Manual in Chapter 4.8.3.3
* "Power Management Timer (PM_TMR)".
*
* Its IO port is 0xb008 by default.
* "This is a 24-bit counter that runs off a 3.579545-MHz clock and counts
* while in the S0 working system state."
*
* The client has to cope with wrap arounds.
*
* This can be used in linux with cmdline "clocksource=acpi_pm".
*
* We do not support interrupt generation.
*/
#include "device_factory.h"
#include "guest.h"
#include "device.h"
#include "acpi.h"
#include "io_device.h"
#include <l4/re/env.h>
#include <l4/util/rdtsc.h>
namespace Vdev {
/**
 * Emulation of the ACPI power-management timer (PM_TMR).
 *
 * Presents a free-running 32-bit counter at 3.579545 MHz on IO port 0xb008,
 * derived from the host TSC. Announced to the guest via the FADT.
 */
class Acpi_timer:
  public Vmm::Io_device,
  public Vdev::Device,
  public Acpi::Acpi_device
{
public:
  enum
  {
    Frequency_hz = 3579545, ///< PM timer clock mandated by ACPI.
    Port = 0xb008,          ///< Default PM timer IO port.
  };

  Acpi_timer()
  : Acpi_device()
  {
    // Remember the TSC at construction time; the counter value is derived
    // from the time elapsed since then.
    _timebase = l4_rdtsc();
  }

  char const *dev_name() const override
  { return "ACPI Timer"; }

  /// Announce the PM timer (32-bit counter variant) in the FADT.
  void amend_fadt(ACPI_TABLE_FADT *t) const override
  {
    t->PmTimerBlock = Port;
    t->PmTimerLength = 4;
    t->Flags |= ACPI_FADT_32BIT_TIMER;
  }

private:
  /* IO write from the guest to device */
  void io_out(unsigned, Vmm::Mem_access::Width, l4_uint32_t) override
  {
    // this is a read only field, so we can ignore that.
    return;
  }

  /* IO read from the guest */
  void io_in(unsigned, Vmm::Mem_access::Width, l4_uint32_t *value) override
  {
    l4_cpu_time_t now = l4_rdtsc();
    l4_cpu_time_t diff_ns = l4_tsc_to_ns(now - _timebase);
    // Convert elapsed nanoseconds to timer ticks:
    //   ticks = diff_ns * Frequency_hz / 1e9
    // The previous code divided by the integer-truncated tick period
    // (1e9 / Frequency_hz == 279 ns instead of 279.365 ns), which made
    // the emulated timer run ~0.13% fast -- too much drift for a guest
    // clocksource. Split into whole seconds and remainder to stay within
    // 64 bits (rem < 1e9, so rem * Frequency_hz < 2^52); overflow of the
    // seconds term is harmless because the guest observes the counter
    // modulo 2^32 anyway.
    l4_cpu_time_t secs = diff_ns / 1000000000ULL;
    l4_cpu_time_t rem = diff_ns % 1000000000ULL;
    *value = secs * Frequency_hz + rem * Frequency_hz / 1000000000ULL;
  }

  l4_cpu_time_t _timebase = 0; ///< Host TSC value at device construction.
};
} // namespace Vdev
namespace {
/// Device-tree factory for the "acpi-timer" device.
struct F : Vdev::Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &) override
  {
    auto timer = Vdev::make_device<Vdev::Acpi_timer>();
    Acpi::info.printf("Acpi timer @ 0x%x\n", Vdev::Acpi_timer::Port);
    // The timer occupies exactly one IO port.
    devs->vmm()->add_io_device(Vmm::Io_region(Vdev::Acpi_timer::Port,
                                              Vdev::Acpi_timer::Port,
                                              Vmm::Region_type::Virtual),
                               timer);
    return timer;
  }
}; // struct F
static F f;
static Vdev::Device_type t = {"acpi-timer", nullptr, &f};
} // namespace

View File

@@ -0,0 +1,62 @@
/*
* Copyright (C) 2022, 2024 Kernkonzept GmbH.
* Author(s): Christian Pötzsch <christian.potzsch@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "binary_loader_linux.h"
#include "guest.h"
namespace Boot {
enum : unsigned
{
Linux_kernel_start_addr = 0x100000,
};
int Linux_loader::load(char const * /*bin*/, std::shared_ptr<Binary_ds> image,
Vmm::Vm_ram *ram, Vmm::Ram_free_list *free_list,
l4_addr_t *entry)
{
trace().printf("Checking for Linux image...\n");
if (!image->is_valid())
return -L4_EINVAL;
unsigned char const *h = static_cast<unsigned char const *>(image->get_data());
if (!(h[0x1fe] == 0x55 && h[0x1ff] == 0xaa))
return -L4_EINVAL;
info().printf("Linux kernel detected\n");
_64bit = true;
l4_uint8_t num_setup_sects = *(h + Vmm::Bp_setup_sects);
trace().printf("number of setup sections found: 0x%x\n", num_setup_sects);
// 512 is the size of a segment
l4_addr_t setup_sects_size = (num_setup_sects + 1) * 512;
if (Linux_kernel_start_addr < setup_sects_size)
L4Re::chksys(-L4_EINVAL,
"Supplied kernel image contains an invalid number "
" of setup sections (zeropage).");
l4_addr_t start = Linux_kernel_start_addr - setup_sects_size;
trace().printf("size of setup sections: 0x%lx\n", setup_sects_size);
trace().printf("loading binary at: 0x%lx\n", start);
// load the binary starting after the boot_params
*entry = image->load_as_raw(ram, ram->boot2guest_phys(start), free_list);
trace().printf("Loaded kernel image as raw to 0x%lx\n", *entry);
trace().printf("load kernel as raw entry to 0x%lx\n",
ram->guest_phys2boot(
Vmm::Guest_addr(Linux_kernel_start_addr)));
return L4_EOK;
}
static Linux_loader f __attribute__((init_priority(Boot::Linux)));
}

View File

@@ -0,0 +1,50 @@
/*
* Copyright (C) 2023-2024 genua GmbH, 85551 Kirchheim, Germany
* All rights reserved. Alle Rechte vorbehalten.
*/
/*
* Copyright (C) 2025 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "binary_loader_openbsd.h"
#include "guest.h"
namespace Boot {
/// Check for the OpenBSD-specific PT_OPENBSD_RANDOMIZE program header.
bool OpenBSD_loader::is_openbsd(std::shared_ptr<Binary_ds> image) const
{
  bool found = false;
  image->get_elf()->iterate_phdr([&found](Ldr::Elf_phdr ph)
    {
      found = found || (ph.type() == Pt_openbsd_randomize);
    });
  return found;
}
/// Load an OpenBSD kernel ELF image; rejects anything that is not a
/// 64-bit ELF carrying the OpenBSD marker program header.
int OpenBSD_loader::load(char const * /*bin*/, std::shared_ptr<Binary_ds> image,
                         Vmm::Vm_ram *ram, Vmm::Ram_free_list *free_list,
                         l4_addr_t *entry)
{
  trace().printf("Checking for OpenBSD image...\n");

  if (!image->is_valid())
    return -L4_EINVAL;

  bool accepted = image->is_elf_binary() && image->is_elf64()
                  && is_openbsd(image);
  if (!accepted)
    return -L4_EINVAL;

  *entry = image->load_as_elf(ram, free_list);
  _binsize = image->loaded_size();

  info().printf("Loaded OpenBSD kernel image to 0x%lx, size 0x%zx\n", *entry,
                _binsize);
  return L4_EOK;
}
static OpenBSD_loader f __attribute__((init_priority(Boot::OpenBSD)));
}

View File

@@ -0,0 +1,32 @@
/*
* Copyright (C) 2023-2024 genua GmbH, 85551 Kirchheim, Germany
* All rights reserved. Alle Rechte vorbehalten.
*/
/*
* Copyright (C) 2025 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "binary_loader.h"
namespace Boot {
/**
 * Binary loader that recognises and loads OpenBSD kernel ELF images.
 *
 * Detection relies on the OpenBSD-specific program header type
 * PT_OPENBSD_RANDOMIZE (0x65a3dbe6).
 */
class OpenBSD_loader : public Binary_loader
{
// OpenBSD-specific ELF program header type (PT_OPENBSD_RANDOMIZE).
enum { Pt_openbsd_randomize = 0x65a3dbe6 };
public:
OpenBSD_loader()
: Binary_loader(OpenBSD)
{}
/// Check whether \a image carries the OpenBSD marker program header.
bool is_openbsd(std::shared_ptr<Binary_ds> image) const;
/// Load \a image into guest RAM; returns -L4_EINVAL for non-OpenBSD images.
int load(char const *bin, std::shared_ptr<Binary_ds> image, Vmm::Vm_ram *ram,
Vmm::Ram_free_list *free_list, l4_addr_t *entry) override;
};
}

View File

@@ -0,0 +1,24 @@
/*
* Copyright (C) 2022, 2024 Kernkonzept GmbH.
* Author(s): Christian Pötzsch <christian.potzsch@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
namespace Boot {
/// Copy a raw binary image into guest RAM at `RAM base + *entry`
/// (or at the RAM base itself if *entry is unset, i.e. ~0ul) and
/// report the resulting load address back through \a entry.
static int raw_load_image(std::shared_ptr<Binary_ds> image, Vmm::Vm_ram *ram,
                          Vmm::Ram_free_list *free_list, l4_addr_t *entry)
{
  l4_addr_t offset = (*entry == ~0ul) ? 0x0 : *entry;
  // Get the RAM start address.
  Vmm::Guest_addr base = free_list->first_free_address();
  *entry = image->load_as_raw(ram, base + offset, free_list);
  return L4_EOK;
}
}

View File

@@ -0,0 +1,8 @@
/*
* Copyright (C) 2016-2017, 2022, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "cpu_dev.h"

View File

@@ -0,0 +1,266 @@
/*
* Copyright (C) 2017-2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <atomic>
#include "debug.h"
#include "generic_cpu_dev.h"
#include "vcpu_ptr.h"
#include "monitor/cpu_dev_cmd_handler.h"
#include <deque>
#include <mutex>
extern __thread unsigned vmm_current_cpu_id;
namespace Vmm {
/**
 * x86 virtual CPU device.
 *
 * Manages the lifecycle state machine of one vCPU (Sleeping -> Stopped ->
 * Init -> Running/Halted) and the cross-core signalling (INIT/SIPI/STOP)
 * driving the transitions. State-change requests are queued in a
 * mutex-protected message queue and consumed only by the vCPU's own thread;
 * other cores notify the vCPU via an IPI after enqueueing.
 */
class Cpu_dev
: public Generic_cpu_dev,
public Monitor::Cpu_dev_cmd_handler<Monitor::Enabled, Cpu_dev>
{
public:
enum { Max_cpus = 128 };
enum Cpu_state
{
Sleeping = 1, // Startup state, Thread created but not running,
// needs rescheduling.
Stopped, // Waits for INIT signal, no need for rescheduling.
Init, // Wait for SIPI to transition to Running.
Halted, // Idle state, VMentry only on event.
Running
};
private:
// A queued request to move this vCPU into a new state.
struct State_change
{
State_change(Cpu_state s) : target_state(s) {}
Cpu_state target_state;
};
// Callback object handling the inter-processor interrupt of this vCPU.
struct Ipi_event
{
Ipi_event(Cpu_dev *c) : cpu(c) {}
// IPI received: make the vCPU thread inspect its message queue.
void act()
{
cpu->_check_msgq = true;
}
void registration_failure()
{
Dbg().printf("Failed to register IRQ to for IPI; "
"vCPU %u cannot be started.\n", cpu->vcpu().get_vcpu_id());
}
void trigger_failure(long ipc_err)
{
Dbg().printf("IPI to vCPU %u failed with error %li\n",
cpu->vcpu().get_vcpu_id(), ipc_err);
}
Cpu_dev *cpu;
};
public:
Cpu_dev(unsigned idx, unsigned phys_id, Vdev::Dt_node const *)
: Generic_cpu_dev(idx, phys_id),
_ipi(Ipi_event(this))
{
// The boot CPU (index 0) starts out running; APs wait to be woken up.
_cpu_state = (idx == 0) ? Running : Sleeping;
}
~Cpu_dev()
{
Vcpu_obj_registry *reg = _vcpu.get_ipc_registry();
_ipi.disarm(reg);
}
/// Power up the vCPU thread and arm the IPI receiver.
void powerup_cpu() override
{
Generic_cpu_dev::powerup_cpu();
_ipi.arm(_vcpu.get_ipc_registry());
}
/// Reset the Cpu_dev including vCPU does not return to the caller.
void reset() override
{
vmm_current_cpu_id = _vcpu.get_vcpu_id();
info().printf("[%3u] Reset called\n", vmm_current_cpu_id);
reset_common();
// Block until INIT/SIPI signals bring this core online.
wait_until_online();
info().printf("[%3u] Resetting vCPU.\n", vmm_current_cpu_id);
_vcpu.reset(_protected_mode);
}
/// Reset the vCPU while reusing the existing thread and stack.
void hot_reset()
{
// assumption: Guest::run_vm() already called once.
// intention: Do not add leak stack memory.
reset_common();
info().printf("[%3u] Hot resetting vCPU.\n", vmm_current_cpu_id);
_vcpu.hot_reset();
}
/**
 * Translate a device tree "reg" value to an internally usable CPU id.
 *
 * For most architectures this is NOP, but some architectures like ARM
 * might encode topology information into this value, which needs to
 * be translated.
 */
static unsigned dtid_to_cpuid(l4_int32_t prop_val)
{ return prop_val; }
static bool has_fixed_dt_mapping() { return true; }
Cpu_state get_cpu_state() const
{ return _cpu_state; }
/// True while the vCPU executes guest code or idles waiting for events.
bool cpu_online() const
{
Cpu_state s = get_cpu_state();
return (s == Cpu_state::Running) || (s == Cpu_state::Halted);
}
void set_cpu_state(Cpu_state state)
{ _cpu_state = state; }
void set_protected_mode()
{ _protected_mode = true; }
/**
 * Handle the stop event.
 *
 * The event is usually emitted cross core, but also used in CPU local
 * error cases.
 */
void stop() override
{
_stop_irq.disarm(_vcpu.get_ipc_registry());
{
std::lock_guard<std::mutex> lock(_message_q_lock);
// Clear all pending state changes to ensure the core is stopped ASAP.
_message_q.clear();
_message_q.emplace_back(Cpu_state::Stopped);
}
_check_msgq = true;
// Do not do anything blocking here, we need to finish the execution of the
// IPC dispatching that brought us here or return to our local caller.
}
/// core local request to halt the CPU.
void halt_cpu()
{
{
std::lock_guard<std::mutex> lock(_message_q_lock);
_message_q.emplace_back(Cpu_state::Halted);
}
_check_msgq = true;
// No IRQ trigger, we are already in VMexit handling
}
/// Send cross-core INIT signal
void send_init_ipi()
{
{
std::lock_guard<std::mutex> lock(_message_q_lock);
_message_q.emplace_back(Cpu_state::Init);
}
_ipi.trigger();
}
/// Send cross-core SIPI signal
void send_sipi()
{
{
std::lock_guard<std::mutex> lock(_message_q_lock);
_message_q.emplace_back(Cpu_state::Running);
}
_ipi.trigger();
}
/// Dequeue the next requested state, or return the current state if the
/// queue is empty. Must run on the vCPU's own thread.
Cpu_state next_state()
{
if (!has_message())
return get_cpu_state();
std::lock_guard<std::mutex> lock(_message_q_lock);
if (_message_q.empty())
{
_check_msgq = false;
return get_cpu_state();
}
Cpu_state new_state = _message_q.front().target_state;
_message_q.pop_front();
_check_msgq = !_message_q.empty();
return new_state;
}
/**
 * Wait for an IPI, unless there are still items in the message queue.
 */
void wait_for_ipi()
{
if (has_message())
return;
_ipi.receive();
_check_msgq = true;
}
private:
static Dbg info() { return Dbg(Dbg::Cpu, Dbg::Info, "Cpu_dev"); }
bool has_message() const { return _check_msgq; }
/// Wait until an IPI puts the CPU in online state.
void wait_until_online()
{
while (has_message())
set_cpu_state(next_state());
// wait for the SIPI to sets the `Running` state
while (!cpu_online())
{
wait_for_ipi();
while (has_message())
set_cpu_state(next_state());
}
}
/// Functionality performed to reset a vCPU.
void reset_common()
{
_stop_irq.arm(_vcpu.get_ipc_registry());
_vcpu->state = L4_VCPU_F_FPU_ENABLED;
_vcpu->saved_state = L4_VCPU_F_FPU_ENABLED | L4_VCPU_F_USER_MODE;
}
std::atomic<Cpu_state> _cpu_state; // core-local writes; cross-core reads;
bool _protected_mode = false;
bool _check_msgq = false; // use only in local vCPU thread.
Cpu_irq<Ipi_event> _ipi;
// The mutex is used in IPI cases (INIT, SIPI, STOP) and for the local HALT
// event. The IPIs do not happen during normal operation, HALT happens when
// the core has nothing to do and reacts only to IRQs. In all other VMexits,
// this mutex is unused.
std::mutex _message_q_lock;
std::deque<State_change> _message_q;
}; // class Cpu_dev
} // namespace Vmm

View File

@@ -0,0 +1,421 @@
/*
* Copyright (C) 2024 Kernkonzept GmbH.
* Author(s): Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
/**
* \file
* This file defines the x86 CPU features that we present to the guest via
* our CPUID emulation.
*
* General rules:
* - Whitelist only those CPU features that we know to support.
* - We shall support as many features as possible because they might be there
* for performance.
*/
namespace
{
// CPUID leaf 0x01, ECX feature-flag bits.
enum Cpuid_1_ecx : l4_uint32_t
{
Cpuid_1_ecx_sse3 = (1UL << 0),
Cpuid_1_ecx_pclmulqdq = (1UL << 1),
Cpuid_1_ecx_dtes64 = (1UL << 2),
Cpuid_1_ecx_monitor = (1UL << 3),
Cpuid_1_ecx_ds_cpl = (1UL << 4),
Cpuid_1_ecx_vmx = (1UL << 5),
Cpuid_1_ecx_smx = (1UL << 6),
Cpuid_1_ecx_speed_step = (1UL << 7),
Cpuid_1_ecx_thermal_monitor = (1UL << 8),
Cpuid_1_ecx_ssse3 = (1UL << 9),
Cpuid_1_ecx_context_id = (1UL << 10),
Cpuid_1_ecx_sdbg = (1UL << 11),
Cpuid_1_ecx_fma = (1UL << 12),
Cpuid_1_ecx_cmpxchg16b = (1UL << 13),
Cpuid_1_ecx_xtpr_update = (1UL << 14),
Cpuid_1_ecx_pdcm = (1UL << 15),
Cpuid_1_ecx_pcid = (1UL << 17),
Cpuid_1_ecx_dca = (1UL << 18),
Cpuid_1_ecx_sse4_1 = (1UL << 19),
Cpuid_1_ecx_sse4_2 = (1UL << 20),
Cpuid_1_ecx_x2apic = (1UL << 21),
Cpuid_1_ecx_movbe = (1UL << 22),
Cpuid_1_ecx_popcnt = (1UL << 23),
Cpuid_1_ecx_tsc_deadline = (1UL << 24),
Cpuid_1_ecx_aesni = (1UL << 25),
Cpuid_1_ecx_xsave = (1UL << 26),
Cpuid_1_ecx_osxsave = (1UL << 27),
Cpuid_1_ecx_avx = (1UL << 28),
Cpuid_1_ecx_f16c = (1UL << 29),
Cpuid_1_ecx_rdrand = (1UL << 30),
Cpuid_1_ecx_hypervisor = (1UL << 31),
};
// CPUID leaf 0x01, EDX feature-flag bits.
enum Cpuid_1_edx : l4_uint32_t
{
Cpuid_1_edx_fpu = (1UL << 0),
Cpuid_1_edx_vme = (1UL << 1),
Cpuid_1_edx_de = (1UL << 2),
Cpuid_1_edx_pse = (1UL << 3),
Cpuid_1_edx_tsc = (1UL << 4),
Cpuid_1_edx_msr = (1UL << 5),
Cpuid_1_edx_pae = (1UL << 6),
Cpuid_1_edx_mce = (1UL << 7),
Cpuid_1_edx_cx8 = (1UL << 8),
Cpuid_1_edx_apic = (1UL << 9),
Cpuid_1_edx_sep = (1UL << 11),
Cpuid_1_edx_mtrr = (1UL << 12),
Cpuid_1_edx_pge = (1UL << 13),
Cpuid_1_edx_mca = (1UL << 14),
Cpuid_1_edx_cmov = (1UL << 15),
Cpuid_1_edx_pat = (1UL << 16),
Cpuid_1_edx_pse_36= (1UL << 17),
Cpuid_1_edx_psn = (1UL << 18),
Cpuid_1_edx_clfsh = (1UL << 19),
Cpuid_1_edx_ds = (1UL << 21),
Cpuid_1_edx_acpi = (1UL << 22),
Cpuid_1_edx_mmx = (1UL << 23),
Cpuid_1_edx_fxsr = (1UL << 24),
Cpuid_1_edx_sse = (1UL << 25),
Cpuid_1_edx_sse2 = (1UL << 26),
Cpuid_1_edx_ss = (1UL << 27),
Cpuid_1_edx_htt = (1UL << 28),
Cpuid_1_edx_tm = (1UL << 29),
Cpuid_1_edx_pbe = (1UL << 31),
};
// thermal and power management
// CPUID leaf 0x06, EAX bits.
enum Cpuid_6_eax : l4_uint32_t
{
Cpuid_6_eax_temp_sens = (1UL << 0),
Cpuid_6_eax_turbo_boost = (1UL << 1),
Cpuid_6_eax_arat = (1UL << 2),
Cpuid_6_eax_pln = (1UL << 4),
Cpuid_6_eax_ecmd = (1UL << 5),
Cpuid_6_eax_ptm = (1UL << 6),
Cpuid_6_eax_hwp = (1UL << 7),
Cpuid_6_eax_hwp_notify = (1UL << 8),
Cpuid_6_eax_hwp_act_win = (1UL << 9),
Cpuid_6_eax_hwp_energy_perf_pref = (1UL << 10),
Cpuid_6_eax_hwp_package_level = (1UL << 11),
Cpuid_6_eax_hdc = (1UL << 13),
Cpuid_6_eax_turbo_boost_max = (1UL << 14),
Cpuid_6_eax_hwp_capabilities = (1UL << 15),
Cpuid_6_eax_hwp_peci = (1UL << 16),
Cpuid_6_eax_hwp_flex = (1UL << 17),
Cpuid_6_eax_hwp_request_msr = (1UL << 18),
Cpuid_6_eax_hw_feedback = (1UL << 19),
Cpuid_6_eax_ignore_idle_cpu_hwp = (1UL << 20),
Cpuid_6_eax_hwp_control_msr = (1UL << 22),
Cpuid_6_eax_thread_director = (1UL << 23),
Cpuid_6_eax_therm_irq_msr = (1UL << 24),
};
// CPUID leaf 0x07 subleaf 0, EBX bits.
enum Cpuid_7_0_ebx : l4_uint32_t
{
Cpuid_7_0_ebx_fsgsbase = (1UL << 0),
Cpuid_7_0_ebx_tsc_adjust_msr = (1UL << 1),
Cpuid_7_0_ebx_sgx = (1UL << 2),
Cpuid_7_0_ebx_bmi1 = (1UL << 3),
Cpuid_7_0_ebx_hle = (1UL << 4),
Cpuid_7_0_ebx_avx2 = (1UL << 5),
Cpuid_7_0_ebx_fdp_excptn_only= (1UL << 6),
Cpuid_7_0_ebx_smep = (1UL << 7),
Cpuid_7_0_ebx_bmi2 = (1UL << 8),
Cpuid_7_0_ebx_movsb = (1UL << 9),
Cpuid_7_0_ebx_invpcid = (1UL << 10),
Cpuid_7_0_ebx_rtm = (1UL << 11),
Cpuid_7_0_ebx_rdt_m = (1UL << 12),
Cpuid_7_0_ebx_fpu_cs = (1UL << 13),
Cpuid_7_0_ebx_mpx = (1UL << 14),
Cpuid_7_0_ebx_rdt_a = (1UL << 15),
Cpuid_7_0_ebx_avx_512_f = (1UL << 16),
Cpuid_7_0_ebx_avx_512_dq = (1UL << 17),
Cpuid_7_0_ebx_rdseed = (1UL << 18),
Cpuid_7_0_ebx_adx = (1UL << 19),
Cpuid_7_0_ebx_smap = (1UL << 20),
Cpuid_7_0_ebx_avx_512_ifma = (1UL << 21),
Cpuid_7_0_ebx_clflushopt = (1UL << 23),
Cpuid_7_0_ebx_clwb = (1UL << 24),
Cpuid_7_0_ebx_trace = (1UL << 25),
Cpuid_7_0_ebx_avx_512_pf = (1UL << 26),
Cpuid_7_0_ebx_avx_512_er = (1UL << 27),
Cpuid_7_0_ebx_avx_512_cd = (1UL << 28),
Cpuid_7_0_ebx_sha = (1UL << 29),
Cpuid_7_0_ebx_avx_512_bw = (1UL << 30),
Cpuid_7_0_ebx_avx_512_vl = (1UL << 31),
};
// CPUID leaf 0x07 subleaf 0, ECX bits.
enum Cpuid_7_0_ecx : l4_uint32_t
{
Cpuid_7_0_ecx_prefetchwt1 = (1UL << 0),
Cpuid_7_0_ecx_avx_512_vbmi = (1UL << 1),
Cpuid_7_0_ecx_umip = (1UL << 2),
Cpuid_7_0_ecx_pku = (1UL << 3),
Cpuid_7_0_ecx_ospke = (1UL << 4),
Cpuid_7_0_ecx_waitpkg = (1UL << 5),
Cpuid_7_0_ecx_avx_512_vbmi2 = (1UL << 6),
Cpuid_7_0_ecx_cet_ss = (1UL << 7),
Cpuid_7_0_ecx_gfni = (1UL << 8),
Cpuid_7_0_ecx_vaes = (1UL << 9),
Cpuid_7_0_ecx_vpclmulqdq = (1UL << 10),
Cpuid_7_0_ecx_avx_512_vnni = (1UL << 11),
Cpuid_7_0_ecx_avx_512_bitalg = (1UL << 12),
Cpuid_7_0_ecx_tme_en = (1UL << 13),
Cpuid_7_0_ecx_avx_512_vpopcntdq= (1UL << 14),
Cpuid_7_0_ecx_la57 = (1UL << 16),
Cpuid_7_0_ecx_rdpid = (1UL << 22),
Cpuid_7_0_ecx_kl = (1UL << 23),
Cpuid_7_0_ecx_bus_lock_detect = (1UL << 24),
Cpuid_7_0_ecx_cldemote = (1UL << 25),
Cpuid_7_0_ecx_movdiri = (1UL << 27),
Cpuid_7_0_ecx_movdir64b = (1UL << 28),
Cpuid_7_0_ecx_enqcmd = (1UL << 29),
Cpuid_7_0_ecx_sgx_lc = (1UL << 30),
Cpuid_7_0_ecx_pks = (1UL << 31),
};
// CPUID leaf 0x07 subleaf 0, EDX bits.
enum Cpuid_7_0_edx : l4_uint32_t
{
Cpuid_7_0_edx_sgx_keys = (1UL << 1),
Cpuid_7_0_edx_avx_512_4vnniw = (1UL << 2),
Cpuid_7_0_edx_avx_512_4fmaps = (1UL << 3),
Cpuid_7_0_edx_repmov = (1UL << 4),
Cpuid_7_0_edx_uintr = (1UL << 5),
Cpuid_7_0_edx_avx_512_vp2intersect= (1UL << 8),
Cpuid_7_0_edx_srbds_ctrl = (1UL << 9),
Cpuid_7_0_edx_md_clear = (1UL << 10),
Cpuid_7_0_edx_rtm_always_abort = (1UL << 11),
Cpuid_7_0_edx_rtm_force_abort = (1UL << 13),
Cpuid_7_0_edx_serialize = (1UL << 14),
Cpuid_7_0_edx_hybrid = (1UL << 15),
Cpuid_7_0_edx_tsxldtrk = (1UL << 16),
Cpuid_7_0_edx_pconfig = (1UL << 18),
Cpuid_7_0_edx_arch_lbr = (1UL << 19),
Cpuid_7_0_edx_cet_ibt = (1UL << 20),
Cpuid_7_0_edx_amx_fb16 = (1UL << 22),
Cpuid_7_0_edx_avx_512_fp16 = (1UL << 23),
Cpuid_7_0_edx_amx_tile = (1UL << 24),
Cpuid_7_0_edx_amx_int8 = (1UL << 25),
Cpuid_7_0_edx_ibrs = (1UL << 26),
Cpuid_7_0_edx_stibp = (1UL << 27),
Cpuid_7_0_edx_l1d_flush = (1UL << 28),
Cpuid_7_0_edx_arch_cap_msr = (1UL << 29),
Cpuid_7_0_edx_core_cap_msr = (1UL << 30),
Cpuid_7_0_edx_ssbd = (1UL << 31),
};
// Extended leaf 0x8000'0001, ECX bits.
enum Cpuid_8000_0001_ecx : l4_uint32_t
{
// TODO amd has several bits here
Cpuid_8000_0001_ecx_lahf = (1UL << 0),
Cpuid_8000_0001_ecx_lzcnt = (1UL << 5),
Cpuid_8000_0001_ecx_prefetchw = (1UL << 8),
};
// Extended leaf 0x8000'0001, EDX bits.
enum Cpuid_8000_0001_edx : l4_uint32_t
{
Cpuid_8000_0001_edx_syscall = (1UL << 11),
Cpuid_8000_0001_edx_nx = (1UL << 20),
Cpuid_8000_0001_edx_1gb = (1UL << 26),
Cpuid_8000_0001_edx_rdtscp = (1UL << 27),
Cpuid_8000_0001_edx_ia64 = (1UL << 29),
};
// Extended leaf 0x8000'0007, EDX bits.
enum Cpuid_8000_0007_edx : l4_uint32_t
{
Cpuid_8000_0007_edx_invariant_tsc = (1UL << 8),
};
// Extended leaf 0x8000'0008, EBX bits (AMD).
enum Cpuid_8000_0008_ebx : l4_uint32_t
{
Cpuid_8000_0008_ebx_amd_clzero = (1UL << 0),
Cpuid_8000_0008_ebx_amd_instretcnt_msr = (1UL << 1),
Cpuid_8000_0008_ebx_amd_rstrfperrptrs = (1UL << 2),
Cpuid_8000_0008_ebx_amd_invlpkg = (1UL << 3),
Cpuid_8000_0008_ebx_amd_rdpru = (1UL << 4),
Cpuid_8000_0008_ebx_amd_mcommit = (1UL << 8),
Cpuid_8000_0008_ebx_wbnoinvd = (1UL << 9),
// AMD speculation control.
// 0x8000'0008 EBX
// Whitepaper AMD64 Technology: Indirect Branch Control Extension,
// revision 4.10.18
Cpuid_8000_0008_ebx_amd_ibpb = (1UL << 12),
Cpuid_8000_0008_ebx_amd_ibrs = (1UL << 14),
Cpuid_8000_0008_ebx_amd_stibp = (1UL << 15),
// Whitepaper AMD64 Technology: Speculative Store Bypass Disable, 5.21.18
Cpuid_8000_0008_ebx_amd_ssbd = (1UL << 24),
};
}; // namespace
namespace Vmm
{
/**
 * CPUID values and feature masks reported to the guest.
 *
 * The `*_supported` masks filter the host's feature bits; the `*_mandatory`
 * masks are always reported regardless of what the hardware offers.
 */
enum Cpuid_configuration : l4_uint32_t
{
  // --- general configuration ---
  Cpuid_max_basic_info_leaf = 0x1f,
  Cpuid_max_ext_info_leaf = 0x8000'0008,

  // --- per-leaf configuration ---

  // Leaf 0x1 ECX. Not forwarded to the guest: monitor, vmx, smx,
  // thermal_monitor, speed_step, sdbg, osxsave, xtpr_update, pdcm,
  // context_id, dca, ds_cpl, dtes64.
  Cpuid_1_ecx_supported =
    Cpuid_1_ecx_sse3
    | Cpuid_1_ecx_pclmulqdq
    | Cpuid_1_ecx_ssse3
    | Cpuid_1_ecx_fma
    | Cpuid_1_ecx_cmpxchg16b
    | Cpuid_1_ecx_sse4_1
    | Cpuid_1_ecx_sse4_2
    | Cpuid_1_ecx_movbe
    | Cpuid_1_ecx_popcnt
    | Cpuid_1_ecx_tsc_deadline
    | Cpuid_1_ecx_aesni
    | Cpuid_1_ecx_xsave
    | Cpuid_1_ecx_avx
    | Cpuid_1_ecx_f16c
    | Cpuid_1_ecx_pcid
    | Cpuid_1_ecx_rdrand,

  Cpuid_1_ecx_mandatory =
    Cpuid_1_ecx_hypervisor
    // x2apic is emulated even if the host doesn't have it
    | Cpuid_1_ecx_x2apic,

  // Leaf 0x1 EDX. Not forwarded to the guest: mca, acpi, ds, tm, htt,
  // psn, pbe.
  Cpuid_1_edx_supported =
    Cpuid_1_edx_fpu
    | Cpuid_1_edx_vme
    | Cpuid_1_edx_de
    | Cpuid_1_edx_pse
    | Cpuid_1_edx_tsc
    | Cpuid_1_edx_msr
    | Cpuid_1_edx_pae
    | Cpuid_1_edx_mce
    | Cpuid_1_edx_cx8
    | Cpuid_1_edx_apic
    | Cpuid_1_edx_sep
    | Cpuid_1_edx_mtrr
    | Cpuid_1_edx_pge
    | Cpuid_1_edx_cmov
    | Cpuid_1_edx_pat
    | Cpuid_1_edx_pse_36
    | Cpuid_1_edx_clfsh
    | Cpuid_1_edx_mmx
    | Cpuid_1_edx_fxsr
    | Cpuid_1_edx_sse
    | Cpuid_1_edx_sse2
    | Cpuid_1_edx_ss,

  Cpuid_6_eax_supported =
    Cpuid_6_eax_arat,

  // Leaf 0x7 subleaf count reported in EAX.
  Cpuid_7_0_eax_leafs = 1,

  // Leaf 0x7/0 EBX. Not forwarded to the guest: mpx, trace.
  Cpuid_7_0_ebx_supported =
    Cpuid_7_0_ebx_fsgsbase
    | Cpuid_7_0_ebx_bmi1
    | Cpuid_7_0_ebx_hle
    | Cpuid_7_0_ebx_avx2
    | Cpuid_7_0_ebx_fdp_excptn_only
    | Cpuid_7_0_ebx_smep
    | Cpuid_7_0_ebx_bmi2
    | Cpuid_7_0_ebx_movsb
    | Cpuid_7_0_ebx_rtm
    | Cpuid_7_0_ebx_fpu_cs
    | Cpuid_7_0_ebx_avx_512_f
    | Cpuid_7_0_ebx_avx_512_dq
    | Cpuid_7_0_ebx_rdseed
    | Cpuid_7_0_ebx_adx
    | Cpuid_7_0_ebx_smap
    | Cpuid_7_0_ebx_avx_512_ifma
    | Cpuid_7_0_ebx_clflushopt
    | Cpuid_7_0_ebx_clwb
    | Cpuid_7_0_ebx_avx_512_pf
    | Cpuid_7_0_ebx_avx_512_er
    | Cpuid_7_0_ebx_avx_512_cd
    | Cpuid_7_0_ebx_sha
    | Cpuid_7_0_ebx_invpcid
    | Cpuid_7_0_ebx_avx_512_bw
    | Cpuid_7_0_ebx_avx_512_vl,

  // Leaf 0x7/0 ECX. Not forwarded to the guest: ospke, waitpkg,
  // la57 (ia32e 5 level paging).
  Cpuid_7_0_ecx_supported =
    Cpuid_7_0_ecx_prefetchwt1
    | Cpuid_7_0_ecx_avx_512_vbmi
    | Cpuid_7_0_ecx_umip
    | Cpuid_7_0_ecx_avx_512_vbmi2
    | Cpuid_7_0_ecx_avx_512_vnni
    | Cpuid_7_0_ecx_avx_512_bitalg,

  Cpuid_7_0_edx_supported =
    Cpuid_7_0_edx_avx_512_4vnniw
    | Cpuid_7_0_edx_avx_512_4fmaps
    | Cpuid_7_0_edx_repmov
    | Cpuid_7_0_edx_avx_512_vp2intersect
    | Cpuid_7_0_edx_avx_512_fp16
    | Cpuid_7_0_edx_uintr
    | Cpuid_7_0_edx_md_clear,

  Cpuid_8000_0001_ecx_supported =
    Cpuid_8000_0001_ecx_lahf,

  Cpuid_8000_0001_edx_supported =
    Cpuid_8000_0001_edx_syscall
    | Cpuid_8000_0001_edx_nx
    | Cpuid_8000_0001_edx_1gb
    | Cpuid_8000_0001_edx_ia64,

  Cpuid_8000_0007_edx_supported =
    Cpuid_8000_0007_edx_invariant_tsc,

  // According to the Linux source code at arch/x86/kernel/cpu/common.c,
  // "[...] a hypervisor might have set the individual AMD bits even on
  // Intel CPUs, for finer-grained selection of what's available."
  // Thus filter AMD bits for the case of nested virtualization.
  Cpuid_8000_0008_ebx_supported =
    Cpuid_8000_0008_ebx_wbnoinvd,
};
/**
 * Apply the configured feature policy to one CPUID result register.
 *
 * \param[in,out] host_register   Register value reported by the hardware;
 *                                updated in place.
 * \param         supported_bits  Bits the guest is allowed to see.
 * \param         mandatory_bits  Bits always reported to the guest
 *                                (default: none).
 */
inline void
cpuid_reg_apply(l4_uint32_t *host_register,
                l4_uint32_t supported_bits,
                l4_uint32_t mandatory_bits = 0)
{
  *host_register = (*host_register & supported_bits) | mandatory_bits;
}
}; // namespace Vmm

View File

@@ -0,0 +1,40 @@
/*
* Copyright (C) 2022, 2024 Kernkonzept GmbH.
* Author(s): Jakub Jermar <jakub.jermar@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/types.h>
#include "device.h"
namespace Vmm {
/**
* Interface for devices responding to guest CPUID invocations.
*/
struct Cpuid_device : virtual Vdev::Dev_ref
{
  // Pure virtual destructor makes the interface abstract; it still needs a
  // definition (provided out of line below).
  virtual ~Cpuid_device() = 0;

  /**
   * Handle the CPUID instruction.
   *
   * \param      regs  Guest register state.
   * \param[out] a     Output value for RAX.
   * \param[out] b     Output value for RBX.
   * \param[out] c     Output value for RCX.
   * \param[out] d     Output value for RDX.
   *
   * \return True if the device handled the CPUID instruction,
   *         false otherwise.
   */
  virtual bool handle_cpuid(l4_vcpu_regs_t const *regs, unsigned *a,
                            unsigned *b, unsigned *c, unsigned *d) const = 0;
};

// Out-of-line definition required even for a pure virtual destructor.
inline Cpuid_device::~Cpuid_device() = default;
} // namespace

View File

@@ -0,0 +1,22 @@
/*
* Copyright (C) 2019, 2024 Kernkonzept GmbH.
* Author(s): Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "debugger/generic_guest_debugger.h"
#include "monitor/dbg_cmd_handler.h"
namespace Monitor {
/**
 * Guest debugger: combines the generic debugger back end with the monitor
 * debug-command front end (enabled/disabled via the `Enabled` flag).
 */
class Guest_debugger
: public Generic_guest_debugger,
  public Dbg_cmd_handler<Enabled, Guest_debugger>
{
public:
  // Reuse the base-class constructors unchanged.
  using Generic_guest_debugger::Generic_guest_debugger;
};
}

View File

@@ -0,0 +1,58 @@
/*
* Copyright (C) 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "event_record.h"
#include "event_record_lapic.h"
namespace Vmm {
// Inject an exception (event type 3). The error code is delivered only if
// one was recorded; Invalid_error marks "no error code".
bool Event_exc::inject(Vm_state *vm)
{
  vm->inject_event(
    Injection_event(ev_num, 3, error_val != Invalid_error, error_val));
  return true;
}
// Inject a real-mode exception (event type 3) without an error code.
bool Real_mode_exc::inject(Vm_state *vm)
{
  vm->inject_event(Injection_event(ev_num, 3, false));
  return true;
}
// Inject a pending NMI if the guest can currently accept one; otherwise
// request an NMI-window exit and report failure.
bool Event_nmi::inject(Vm_state *vms)
{
  if (!vms->can_inject_nmi())
    {
      // Not injectable right now: ask for a notification once it is.
      vms->enable_nmi_window();
      return false;
    }

  vms->disable_nmi_window();
  lapic->next_pending_nmi();
  vms->inject_event(Injection_event(2, 2, false)); // NMI is vector 2, type 2
  return true;
}
// Inject the highest-priority pending IRQ from the local APIC if the guest
// can currently accept interrupts; otherwise request an interrupt-window
// exit and report failure.
bool Event_irq::inject(Vm_state *vms)
{
  if (!vms->can_inject_interrupt())
    {
      // Not injectable right now: ask for a notification once it is.
      vms->enable_interrupt_window();
      return false;
    }

  vms->disable_interrupt_window();
  int vector = lapic->next_pending_irq();
  // The placeholder event may outlive the pending IRQ; a negative vector
  // means nothing is pending anymore and the event is simply consumed.
  if (vector >= 0)
    vms->inject_event(Injection_event(vector, 0, false)); // IRQ vector, type 0
  return true;
}
} // namespace Vmm

View File

@@ -0,0 +1,124 @@
/*
* Copyright (C) 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "vm_state.h"
namespace Vmm {
/**
* Event priority order
*
 * The priority is specified in the Intel SDM 12/2022 Vol 3
* Section 6.9 "Prioritization of Concurrent Events".
*/
// Lower numeric value = higher priority: the recorder's queue comparator
// orders events ascending by this value and injects the smallest first.
enum Event_prio : char
{
  // on instruction events
  Abort = 0,
  Exception,
  Sw_int1,
  Sw_int3,
  Sw_intO,
  Sw_intN,
  Bound,
  // potentially concurrent events raised on instruction boundaries.
  Reset,
  Machine_check,
  Trap_task_switch,
  Ext_hw_intervention,
  Trap_dbg_except,
  Nmi,
  Irq,
  Fault_dbg_except,
  Fault_fetch_next_instr,
  Fault_decode_next_instr,
  Prio_max // must be last
};
/**
* Single event record, e.g. for an event raised by hardware.
*/
struct Event_record
{
  explicit Event_record(Event_prio p) : prio(p) {}
  virtual ~Event_record() = default;

  /// Inject this event into the guest; returns true iff the event was
  /// consumed and may be freed.
  virtual bool inject(Vm_state *vms) = 0;

  // Events are ordered by priority alone (see Event_prio).
  constexpr bool operator < (Event_record const &o) const
  { return prio < o.prio; }

  constexpr bool operator > (Event_record const &o) const
  { return prio > o.prio; }

  constexpr bool operator == (Event_record const &o) const
  { return prio == o.prio; }

  Event_prio const prio; ///< Type of the Event_record
};
/**
* Exception event record.
*/
struct Event_exc : Event_record
{
  // Sentinel: no error code is pushed for this exception.
  enum : unsigned { Invalid_error = ~0U };

  /// Exception without an error code.
  explicit Event_exc(Event_prio p, unsigned ev_num)
  : Event_record(p), ev_num(ev_num)
  {}

  /// Exception with error code `e_val`.
  Event_exc(Event_prio p, unsigned ev_num, unsigned e_val)
  : Event_record(p), ev_num(ev_num), error_val(e_val)
  {}

  bool inject(Vm_state *vm) override;

  unsigned ev_num;                    ///< Event number to inject
  unsigned error_val = Invalid_error; ///< Error value to push on the stack
};
/// Exception event record for a guest running in real mode (no error code).
struct Real_mode_exc : Event_record
{
  explicit Real_mode_exc(Event_prio p, unsigned ev_num)
  : Event_record(p), ev_num(ev_num)
  {}

  bool inject(Vm_state *vm) override;

  unsigned ev_num; ///< Event number to inject
};
/**
* Generic software exception/interrupt event to inject into the guest.
*
* \tparam TYPE Event type to use in injection.
*/
template <l4_uint8_t TYPE>
struct Event_sw_generic : Event_record
{
  Event_sw_generic(Event_prio p, unsigned ev_num, unsigned insn_len)
  : Event_record(p), ev_num(ev_num), instruction_len(insn_len)
  {}

  bool inject(Vm_state *vm) override
  {
    vm->inject_event(Injection_event(ev_num, TYPE, false));
    // On VMX the entry IP must be advanced manually past the triggering
    // instruction; other implementations handle this themselves.
    if (vm->type() == Vm_state::Type::Vmx)
      vm->advance_entry_ip(instruction_len);
    return true;
  }

  unsigned ev_num;          ///< Event number to inject
  unsigned instruction_len; ///< Bytes to advance IP
};
} // namespace Vmm

View File

@@ -0,0 +1,50 @@
/*
* Copyright (C) 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "event_record.h"
#include "virt_lapic.h"
namespace Vmm
{
/**
* NMI event record.
*/
struct Event_nmi : Event_record
{
  explicit Event_nmi(Gic::Virt_lapic *apic)
  : Event_record(Event_prio::Nmi), lapic(apic)
  {}

  bool inject(Vm_state *vm) override;

  // Local APIC queried for the pending NMI on injection; not owned.
  Gic::Virt_lapic *lapic;
};
/**
* IRQ event record.
*/
struct Event_irq : Event_record
{
  explicit Event_irq(Gic::Virt_lapic *apic)
  : Event_record(Event_prio::Irq), lapic(apic)
  {}

  bool inject(Vm_state *vm) override;

  // Local APIC queried for the pending IRQ vector on injection; not owned.
  Gic::Virt_lapic *lapic;
};
// These are necessary to correctly compute Event_memory::max_event_size(),
// which considers only Event_exc, Real_mode_exc and Event_sw_generic.
// The asserts ensure that these event objects don't influence the computation.
static_assert(sizeof(Event_irq) <= sizeof(Event_exc),
              "IRQ event objects are not the largest event object.");
static_assert(sizeof(Event_nmi) <= sizeof(Event_exc),
              "NMI event objects are not the largest event object.");
} // namespace Vmm

View File

@@ -0,0 +1,114 @@
/*
* Copyright (C) 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "event_recorder.h"
#include "debug.h"
namespace Vmm {
// Try to inject the highest-priority recorded event. On success the event is
// removed from the queue, the summary flags are updated and the event's
// memory is released.
bool Event_recorder::inject(Vm_state *vms)
{
  if (empty())
    return false;

  auto top = _queue.top();
  if (top->inject(vms))
    {
      _queue.pop();
      if (top->prio == Event_prio::Exception)
        {
          // Several exceptions may be queued; only clear the flag when no
          // further exception remains at the top of the queue.
          if (_queue.empty() || _queue.top()->prio != Event_prio::Exception)
            _has_exception = false;
        }
      else if (top->prio == Event_prio::Nmi)
        _has_nmi = false;
      else if (top->prio == Event_prio::Irq)
        _has_irq = false;

      // We have ownership. We have to free the memory!
      free_event(top);
      return true;
    }

  return false;
}
/**
 * Record an event; ownership of `event` transfers to this recorder.
 *
 * NMI and IRQ events act as single placeholders: if one of that kind is
 * already queued, the duplicate is released immediately instead of being
 * queued a second time.
 *
 * \param event  Event to record; allocated via allocate_event().
 */
void Event_recorder::add(Event_record *event)
{
  if (event->prio == Event_prio::Exception)
    _has_exception = true;
  else if (event->prio == Event_prio::Nmi)
    {
      if (_has_nmi)
        {
          // We already own a queued NMI placeholder; release the duplicate
          // instead of leaking it.
          free_event(event);
          return;
        }
      _has_nmi = true;
    }
  else if (event->prio == Event_prio::Irq)
    {
      if (_has_irq)
        {
          // Same for a duplicate IRQ placeholder.
          free_event(event);
          return;
        }
      _has_irq = true;
    }

  // Raw pointer: pushed by value; a std::move would be a no-op.
  _queue.push(event);
}
// Drop all recorded events, releasing their memory, and reset the summary
// flags.
void Event_recorder::clear()
{
  while (!_queue.empty())
    {
      auto top = _queue.top();
      _queue.pop();
      // We have ownership. We have to free the memory!
      free_event(top);
    }

  _has_exception = false;
  _has_nmi = false;
  _has_irq = false;
}
// True iff no event is currently recorded.
bool Event_recorder::empty() const
{ return _queue.empty(); }
// Debugging aid: print the top-of-queue event and the queue size for the
// given vCPU.
void Event_recorder::dump(unsigned vcpu_id) const
{
  // Human-readable names indexed by Event_prio; note that Event_prio::Irq is
  // printed as "Interrupt".
  static char const *Event_prio_names[Event_prio::Prio_max] = {
    "Abort",
    "Exception",
    "Sw_int1",
    "Sw_int3",
    "Sw_intO",
    "Sw_intN",
    "Bound",
    "Reset",
    "Machine_check",
    "Trap_task_switch",
    "Ext_hw_intervention",
    "Trap_dbg_except",
    "Nmi",
    "Interrupt",
    "Fault_dbg_except",
    "Fault_fetch_next_instr",
    "Fault_decode_next_instr",
  };

  if (_queue.empty())
    {
      Dbg().printf("[%3u] Ev_rec: No event recorded.\n", vcpu_id);
      return;
    }

  auto prio = _queue.top()->prio;
  char const *name = prio < Event_prio::Prio_max ? Event_prio_names[prio]
                                                 : "Index out of bounds";
  Dbg().printf("[%3u] Ev_rec: Top event has prio %i (%s); #events: %zu\n",
               vcpu_id, prio, name, _queue.size());
}
} // namespace Vmm

View File

@@ -0,0 +1,307 @@
/*
* Copyright (C) 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "event_record.h"
#include "vm_state.h"
#include "debug.h"
#include <l4/re/error_helper>
#include <l4/cxx/bitmap>
#include <l4/cxx/minmax>
#include <vector>
#include <queue>
#include <cassert>
namespace Vmm {
/// Recorder of all events for a core.
class Event_recorder
{
public:
  // Releases all still-queued events; they are owned by this recorder.
  ~Event_recorder() { clear(); }

  /**
   * Inject highest priority event.
   *
   * \retval true   Event injected.
   * \retval false  No event to inject or can't inject pending event.
   */
  bool inject(Vm_state *vms);

  /**
   * Record an event.
   *
   * \note Pending interrupts are recorded as placeholder item such that the
   *       caller knows to query the local APIC. NMI and IRQs are just
   *       recorded once.
   *
   * \post Ownership moves to `Event_recorder`.
   */
  void add(Event_record *event);

  /// Clears all recorded events.
  void clear();

  /// True, iff no event recorded.
  bool empty() const;

  /// FIXME for MSR interface lacking return value tristate.
  bool has_exception() const { return _has_exception; }

  /// true, iff NMI event already recorded
  bool has_nmi() const { return _has_nmi; }

  /// true, iff IRQ event already recorded
  bool has_irq() const { return _has_irq; }

  /// debugging aid
  void dump(unsigned vcpu_id) const;

  /// Create an Event instance and record it.
  template <typename T, typename... ARGS>
  void make_add_event(ARGS... args)
  {
    add(allocate_event<T, ARGS...>(args...));
  }

private:
  static Dbg warn() { return Dbg(Dbg::Core, Dbg::Warn, "Event recorder"); }

  /**
   * Allocate memory for an object of type `T`.
   *
   * \tparam T  Type derived from `Event_record`.
   *
   * Uses the preallocated `Event_memory` pool and falls back to heap
   * allocation (slow path) when the pool is exhausted.
   */
  template <typename T, typename... ARGS>
  Event_record *allocate_event(ARGS... args)
  {
    static bool warn_once = true;
    char *addr = _memory.alloc(sizeof(T));
    if (addr)
      return new (addr) T(std::forward<ARGS>(args)...);
    else
      {
        // Print message once, if dynamic allocation is necessary on any core.
        if (warn_once)
          {
            warn_once = false;
            warn().printf("Usage of the slow path for event allocation. Memory "
                          "preallocation exceeded for the first time.");
          }
        return new T(std::forward<ARGS>(args)...);
      }
  }

  /**
   * Destruct object derived from `Event_record` and free the memory.
   *
   * \param object  Address of the object to destruct and free.
   */
  void free_event(Event_record *object)
  {
    // Pool-allocated objects need explicit destructor call plus pool free;
    // heap fall-back allocations are released with plain delete.
    if (_memory.in_memory(reinterpret_cast<char *>(object)))
      {
        object->~Event_record();
        _memory.free(reinterpret_cast<char *>(object));
      }
    else
      delete object;
  }

  /**
   * Encapsulate all memory management for Event_records within this class.
   *
   * We want to avoid dynamic memory allocation during VM exit handling and
   * thus preallocate the memory and create events within this memory range.
   * The memory is split into chunks that fit all Event_records object sizes
   * and returns one such chunk on request.
   *
   * It's an open question how to handle OOM situations.
   *
   * NOTE(review): `_bin` is a raw owning pointer and the implicitly
   * generated copy operations would alias it, leading to a double delete.
   * Presumably recorders are never copied after Event_recorder_array::init();
   * confirm, or delete the copy operations.
   */
  class Event_memory
  {
    // Type-erased interface so Event_memory need not be a template.
    struct Bin_if
    {
      virtual ~Bin_if() = default;
      virtual char *alloc() = 0;
      virtual bool free(char *) = 0;
      virtual bool managed_addr(char *addr) const = 0;
    };

    // Fixed-size pool: SLOTS chunks of BIN_SIZE bytes, usage tracked in a
    // bitmap.
    template <unsigned BIN_SIZE, unsigned SLOTS>
    struct Bin : Bin_if
    {
      Bin() { slot_used.clear_all(); }
      ~Bin() = default;

      // Return a free chunk or nullptr if the bin is full.
      char *alloc() noexcept override
      {
        int free_idx = slot_used.scan_zero(0);
        if (free_idx >= 0)
          {
            slot_used[free_idx] = true;
            return mem + free_idx * BIN_SIZE;
          }

        warn().printf("no space in bin left to allocate. Bin addr %p, num bins "
                      "%u, bin size %u\n",
                      &mem, SLOTS, BIN_SIZE);
        return nullptr;
      }

      // Mark the chunk containing `addr` as free; `addr` must be managed by
      // this bin.
      bool free(char *addr) noexcept override
      {
        unsigned bin_idx = (addr - mem) / BIN_SIZE;
        assert(slot_used[bin_idx] == true);
        slot_used[bin_idx] = false;
        return true;
      }

      // True iff `addr` lies within this bin's memory range.
      bool managed_addr(char * addr) const noexcept override
      {
        if (addr < mem || addr >= mem + MEM_SIZE)
          {
            info().printf("Address %p not in bin-managed range[%p, %p]. Bin "
                          "size: 0x%x\n",
                          addr, mem, mem + MEM_SIZE, BIN_SIZE);
            return false;
          }
        return true;
      }

      static unsigned constexpr MEM_SIZE = BIN_SIZE * SLOTS;
      cxx::Bitmap<SLOTS> slot_used;
      char mem[MEM_SIZE];
    };

    /**
     * Compute maximum object size of all events.
     *
     * This depends on static_asserts for Event_nmi & Event_irq.
     */
    static unsigned constexpr max_event_size()
    {
      // Event types: Event_exc, Real_mode_exc, Event_sw_generic, Event_nmi,
      // Event_irq
      unsigned constexpr size =
        cxx::max(sizeof(Event_exc), sizeof(Real_mode_exc),
                 sizeof(Event_sw_generic<0>));
      // round to next power of two to fit to cache lines.
      return next_pow2(size);
    }

    /**
     * Compute the next larger value which is a power of two.
     *
     * \param num  Number to start from.
     */
    static unsigned constexpr next_pow2(unsigned num)
    {
      static_assert(sizeof(unsigned) <= 4,
                    "Next power of 2 algorithm only supports 32-bit numbers.");
      if (num == 0U)
        return 1;

      // Classic bit-smearing: propagate the highest set bit downwards, then
      // increment.
      --num;
      num |= num >> 1;
      num |= num >> 2;
      num |= num >> 4;
      num |= num >> 8;
      num |= num >> 16;
      return ++num;
    }

  public:
    Event_memory()
    {
      // instead of one preallocated bin per event size, we simplify and
      // use one bin for all events and accept the additional temporary memory
      // usage within a bin. Only the bin size affects the total memory usage.
      unsigned constexpr size = max_event_size();
      _bin = new Bin<size, 32>();
    }

    ~Event_memory()
    {
      if (_bin)
        delete _bin;
    }

    // Return a chunk large enough for any event type, or nullptr when the
    // pool is exhausted (the size argument is unused: all chunks are
    // max_event_size() bytes).
    char *alloc(l4_size_t /* size */)
    {
      char *addr = _bin->alloc();
      return addr;
    }

    // pre: in_memory(addr) == true
    void free(char *addr)
    {
      assert(in_memory(addr));
      _bin->free(addr);
    }

    // True iff `addr` was allocated from this pool (as opposed to the heap
    // fall-back).
    bool in_memory(char *addr)
    {
      return _bin->managed_addr(addr);
    }

  private:
    static Dbg warn() { return Dbg(Dbg::Core, Dbg::Warn, "Event memory"); }
    static Dbg info() { return Dbg(Dbg::Core, Dbg::Info, "Event memory"); }

    Bin_if *_bin;
  }; // class Event_memory

  using Qtype = Event_record *;

  // Comparator for a min-heap on Event_prio: top() is the event with the
  // smallest priority value, i.e. the highest priority.
  struct QGreater
  {
    bool operator()(Qtype const &item1, Qtype const &item2) const
    { return *item1 > *item2; }
  };

  std::priority_queue<Qtype, std::vector<Qtype>, QGreater> _queue;
  Event_memory _memory;
  bool _has_exception = false;
  bool _has_nmi = false;
  bool _has_irq = false;
};
/// Interface to get the event recorder for a specific core.
struct Event_recorders
{
  // Virtual destructor: implementations may be destroyed through this
  // interface; without it such a delete is undefined behavior.
  virtual ~Event_recorders() = default;

  /// Return the event recorder of core `num`.
  virtual Event_recorder *recorder(unsigned num) = 0;
};
/**
* Management entity for one `Event_recorder` per core.
*/
class Event_recorder_array : public Event_recorders
{
public:
  virtual ~Event_recorder_array() = default;

  /// Allocate one default-constructed recorder per core; call once at setup.
  void init(unsigned size)
  { _recorders.resize(size); }

  /// Return the recorder of core `num`; `num` must be below the init() size.
  Event_recorder *recorder(unsigned num) override
  {
    assert(num < _recorders.size());
    return &_recorders[num];
  }

private:
  std::vector<Event_recorder> _recorders;
};
} // namespace Vmm

View File

@@ -0,0 +1,119 @@
/*
* Copyright (C) 2021-2022, 2024 Kernkonzept GmbH.
* Author(s): Jean Wolter <jean.wolter@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/re/video/goos>
#include "guest.h"
static bool fb_present = false;
static l4_uint64_t fb_addr, fb_size;
static L4Re::Video::View::Info fb_viewinfo;
namespace Vdev {
// Taken from linux/include/uapi/linux/screen_info.h
// Guest-visible boot-protocol structure; the layout (including the byte
// offsets noted per field) must match the Linux definition exactly, hence
// the packed attribute.
struct screen_info {
  l4_uint8_t  orig_x;           /* 0x00 */
  l4_uint8_t  orig_y;           /* 0x01 */
  l4_uint16_t ext_mem_k;        /* 0x02 */
  l4_uint16_t orig_video_page;  /* 0x04 */
  l4_uint8_t  orig_video_mode;  /* 0x06 */
  l4_uint8_t  orig_video_cols;  /* 0x07 */
  l4_uint8_t  flags;            /* 0x08 */
  l4_uint8_t  unused2;          /* 0x09 */
  l4_uint16_t orig_video_ega_bx;/* 0x0a */
  l4_uint16_t unused3;          /* 0x0c */
  l4_uint8_t  orig_video_lines; /* 0x0e */
  l4_uint8_t  orig_video_isVGA; /* 0x0f */
  l4_uint16_t orig_video_points;/* 0x10 */

  /* VESA graphic mode -- linear frame buffer */
  l4_uint16_t lfb_width;        /* 0x12 */
  l4_uint16_t lfb_height;       /* 0x14 */
  l4_uint16_t lfb_depth;        /* 0x16 */
  l4_uint32_t lfb_base;         /* 0x18 */
  l4_uint32_t lfb_size;         /* 0x1c */
  l4_uint16_t cl_magic, cl_offset; /* 0x20 */
  l4_uint16_t lfb_linelength;   /* 0x24 */
  l4_uint8_t  red_size;         /* 0x26 */
  l4_uint8_t  red_pos;          /* 0x27 */
  l4_uint8_t  green_size;       /* 0x28 */
  l4_uint8_t  green_pos;        /* 0x29 */
  l4_uint8_t  blue_size;        /* 0x2a */
  l4_uint8_t  blue_pos;         /* 0x2b */
  l4_uint8_t  rsvd_size;        /* 0x2c */
  l4_uint8_t  rsvd_pos;         /* 0x2d */
  l4_uint16_t vesapm_seg;       /* 0x2e */
  l4_uint16_t vesapm_off;       /* 0x30 */
  l4_uint16_t pages;            /* 0x32 */
  l4_uint16_t vesa_attributes;  /* 0x34 */
  l4_uint32_t capabilities;     /* 0x36 */
  l4_uint32_t ext_lfb_base;     /* 0x3a */
  l4_uint8_t  _reserved[2];     /* 0x3e */
} __attribute__((packed));
// Value for screen_info::orig_video_isVGA: VESA linear framebuffer.
enum {
  Video_type_vlfb = 0x23
};

// Flags for screen_info::capabilities.
enum {
  Video_capability_skip_quirks = (1 << 0),
  /* Frame buffer base is 64-bit */
  Video_capability_64bit_base = (1 << 1)
};
static void configure_framebuffer(void *zeropage)
{
auto *si = reinterpret_cast<struct screen_info *>(zeropage);
// define framebuffer type
si->orig_video_isVGA = Video_type_vlfb;
si->capabilities = Video_capability_skip_quirks | Video_capability_64bit_base;
// setup address and size of buffer
si->lfb_base = fb_addr & 0xffffffff;
si->ext_lfb_base = fb_addr >> 32;
// framebuffer size is in 64 KiB chunks for VLFB per historical convention
si->lfb_size = l4_round_size(fb_size, 16) >> 16;
// define dimensions
si->lfb_width = fb_viewinfo.width;
si->lfb_height = fb_viewinfo.height;
si->lfb_linelength = fb_viewinfo.bytes_per_line;
// define color
si->lfb_depth = fb_viewinfo.pixel_info.bytes_per_pixel() * 8;
si->red_size = fb_viewinfo.pixel_info.r().size();
si->red_pos = fb_viewinfo.pixel_info.r().shift();
si->green_size = fb_viewinfo.pixel_info.g().size();
si->green_pos = fb_viewinfo.pixel_info.g().shift();
si->blue_size = fb_viewinfo.pixel_info.b().size();
si->blue_pos = fb_viewinfo.pixel_info.b().shift();
si->rsvd_size = fb_viewinfo.pixel_info.padding().size();
si->rsvd_pos = fb_viewinfo.pixel_info.padding().shift();
}
} // namespace Vdev
namespace Vmm {
/**
 * Register the guest framebuffer to be announced via the zeropage's
 * screen_info.
 *
 * \param addr  Framebuffer base address as seen by the guest.
 * \param size  Framebuffer size in bytes.
 * \param info  Geometry and pixel layout of the framebuffer view.
 *
 * \return True on success, false if a framebuffer was already registered
 *         (only a single framebuffer is supported).
 */
bool
Guest::register_framebuffer(l4_uint64_t addr, l4_uint64_t size,
                            const L4Re::Video::View::Info &info)
{
  if (fb_present)
    {
      Err().printf("0x%llx: Multiple definitions of framebuffer, only one framebuffer is supported\n",
                   addr);
      return false;
    }

  // Stash the parameters for Vdev::configure_framebuffer(), invoked when the
  // zeropage is written.
  fb_present = true;
  fb_addr = addr;
  fb_size = size;
  fb_viewinfo = info;
  Vmm::Zeropage::set_screen_callback(Vdev::configure_framebuffer);
  return true;
}
} // namespace Vmm

View File

@@ -0,0 +1,340 @@
/*
* Copyright (C) 2018-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Benjamin Lamowski <benjamin.lamowski@kernkonzept.com>
* Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "guest.h"
#include "debug.h"
#include "vm_state_svm.h"
namespace Vmm {
/**
* Synchronize VMCB.StateSaveArea.RAX with Vcpu_regs.RAX.
*/
class Rax_guard
{
public:
  // On construction: copy the guest's RAX from the VMCB state save area into
  // the generic register file used by shared VMX/SVM code.
  Rax_guard(Svm_state *vms, l4_vcpu_regs_t *regs) : _vms(vms), _regs(regs)
  { _regs->ax = _vms->vmcb()->state_save_area.rax; }

  // On destruction: write back a possibly modified RAX into the VMCB.
  ~Rax_guard()
  { _vms->vmcb()->state_save_area.rax = _regs->ax; }

private:
  Svm_state *_vms;
  l4_vcpu_regs_t *_regs;
};
/**
 * SVM-specific VM-exit dispatcher.
 *
 * \param cpu  vCPU device that took the exit.
 * \param vms  SVM state (VMCB accessors) of that vCPU.
 *
 * \return Jump_instr/Retry/L4_EOK on handled exits, negative error
 *         otherwise.
 */
template <>
int
Guest::handle_exit<Svm_state>(Vmm::Cpu_dev *cpu, Svm_state *vms)
{
  Vmm::Vcpu_ptr vcpu = cpu->vcpu();
  l4_vcpu_regs_t *regs = &vcpu->r;
  unsigned vcpu_id = vcpu.get_vcpu_id();

  // Synchronize VMCB.StateSaveArea.RAX with Vcpu_regs.RAX. This is necessary
  // because the code shared between VMX and SVM uses the RAX in Vcpu_regs,
  // since in VMX only RSP and RIP are stored in the "guest state save area".
  Rax_guard rax_guard(vms, regs);

  // Initially all fields are clean
  vms->mark_all_clean();

  auto *ev_rec = recorder(vcpu_id);

  using Exit = Svm_state::Exit;
  Exit reason = vms->exit_code();
  switch (reason)
    {
    // TODO: Lacks handlers for some of the enabled intercepts, which have not
    // been triggered during development. If one of these interceptions is hit,
    // first an error message is printed and then the VM is stopped.
    case Exit::Cpuid: return handle_cpuid(vcpu);

    case Exit::Vmmcall: return handle_vm_call(regs);

    case Exit::Ioio:
      {
        Svm_state::Io_info info(vms->exit_info1());
        bool is_read = info.type() == 1;
        unsigned port = info.port();

        trace().printf("[%3u]: VM exit: IO port access with exit info 0x%x: "
                       "%s port 0x%x\n",
                       vcpu_id, info.raw, is_read ? "read" : "write", port);

        // Unsupported: skip the instruction without performing the access.
        if (info.str())
          {
            warn().printf("[%3u]: String based port access is not supported!\n",
                          vcpu_id);
            return Jump_instr;
          }

        // rep prefix is only specified for string port access instructions,
        // which are not yet supported anyway.
        if (info.rep())
          {
            warn().printf("[%3u]: Repeated port access is not supported!\n",
                          vcpu_id);
            return Jump_instr;
          }

        Mem_access::Width wd = Mem_access::Wd32;
        switch (info.data_size())
          {
          case 1: wd = Mem_access::Wd8; break;
          case 2: wd = Mem_access::Wd16; break;
          case 4: wd = Mem_access::Wd32; break;
          }

        return handle_io_access(port, is_read, wd, regs);
      }

    case Exit::Nested_page_fault:
      {
        l4_addr_t guest_phys_addr = vms->exit_info2();
        Svm_state::Npf_info info(vms->exit_info1());

        trace().printf(
          "[%3u]: Nested page fault at gp_addr 0x%lx with exit info 0x%llx\n",
          vcpu_id, guest_phys_addr, info.raw);

        // TODO: Use instruction bytes provided by decode assist
        switch(handle_mmio(guest_phys_addr, vcpu))
          {
          case Retry: return L4_EOK;
          case Jump_instr:
            {
              // TODO: Avoid fetching and decoding the current instruction again
              // (handle_mmio already did that once).
              l4_uint64_t opcode;
              try
                {
                  // overwrite the virtual IP with the physical OP code
                  opcode = vcpu.get_pt_walker()->walk(vms->cr3(), vms->ip());
                }
              catch (L4::Runtime_error &e)
                {
                  warn().printf("[%3u]: Could not determine opcode for MMIO "
                                "access\n",
                                vcpu_id);
                  return -L4_EINVAL;
                }

              // TODO: Check inst_buf points to valid memory and figure out its size.
              unsigned char *inst_buf = reinterpret_cast<unsigned char *>(opcode);
              unsigned inst_buf_len = 15;

              // The next sequential instruction pointer (nRIP) is not saved for
              // nested page faults:
              // > nRIP is saved for instruction intercepts as well as MSR and
              // > IOIO intercepts and exceptions caused by the INT3, INTO,
              // > and BOUND instructions.
              // > For all other intercepts, nRIP is reset to zero.
              if (vms->determine_next_ip_from_ip(regs, inst_buf, inst_buf_len))
                return Jump_instr;
              else
                {
                  warn().printf("[%3u]: Could not determine next ip for MMIO "
                                "access\n",
                                vcpu_id);
                  return -L4_EINVAL;
                }
            }
          default: break;
          }

        warn().printf("[%3u]: Unhandled nested page fault @ 0x%lx\n", vcpu_id,
                      vms->ip());
        warn()
          .printf("[%3u]: Present: %u, Type: %s, Inst.: %u Phys addr: 0x%lx\n",
                  vcpu_id, info.present().get(),
                  info.write() ? "Write" : "Read", info.inst().get(),
                  guest_phys_addr);
        return -L4_EINVAL;
      }

    case Exit::Msr:
      {
        bool write = vms->exit_info1() == 1;
        bool has_already_exception = ev_rec->has_exception();
        if (!msr_devices_rwmsr(regs, write, vcpu_id))
          {
            info().printf("[%3u]: %s unsupported MSR 0x%lx\n", vcpu_id,
                          write ? "Writing" : "Reading", regs->cx);
            // Unsupported MSR access: inject #GP(0).
            ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 13, 0);
            return Retry;
          }

        // If the MSR device itself recorded an exception, re-enter the guest
        // without skipping the instruction so the exception is raised on it.
        if (!has_already_exception && ev_rec->has_exception())
          return Retry;
        else
          return Jump_instr;
      }

    case Exit::Hlt:
      trace().printf("[%3u]: HALT 0x%lx!\n", vcpu_id, vms->ip());
      vms->halt();
      cpu->halt_cpu();
      return Jump_instr;

    case Exit::Cr0_sel_write:
      return vms->handle_cr0_write(regs);

    case Exit::Xsetbv:
      return vms->handle_xsetbv(regs);

    case Exit::Vintr:
      // Used as interrupt window notification, handled in run_vm().
      return L4_EOK;

    case Exit::Rdpmc:
      // Not emulated: inject #GP(0).
      ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 13, 0);
      return Retry;

    case Exit::Dr0_read:
    case Exit::Dr1_read:
    case Exit::Dr2_read:
    case Exit::Dr3_read:
    case Exit::Dr4_read:
    case Exit::Dr5_read:
    case Exit::Dr6_read:
    case Exit::Dr7_read:
      {
        int i = static_cast<int>(reason) - static_cast<int>(Exit::Dr0_read);
        if (i == 4 || i == 5)
          {
            if (vms->vmcb()->state_save_area.cr4 & (1U << 3)) // CR4.DE set?
              {
                // DR4/DR5 access with debug extensions enabled: #UD.
                ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 6);
                return Retry;
              }
            // else: alias to DR6 & DR7
          }

        // Debug registers are not virtualized: report 0 in the target GPR.
        unsigned char gp_reg = vms->vmcb()->control_area.exitinfo1 & 0xf;
        *(&(regs->ax) - gp_reg) = 0;
        return Jump_instr;
      }

    case Exit::Dr8_read:
    case Exit::Dr9_read:
    case Exit::Dr10_read:
    case Exit::Dr11_read:
    case Exit::Dr12_read:
    case Exit::Dr13_read:
    case Exit::Dr14_read:
    case Exit::Dr15_read:
      // AMD APM Vol 2 Chapter 13.1.1.5 "64-Bit-Mode Extended Debug Registers":
      // DR8-15 are not implemented -> #UD
      ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 6);
      return Retry;

    case Exit::Dr0_write:
    case Exit::Dr1_write:
    case Exit::Dr2_write:
    case Exit::Dr3_write:
    case Exit::Dr4_write:
    case Exit::Dr5_write:
    case Exit::Dr6_write:
    case Exit::Dr7_write:
      {
        // Ignore the writes, except to illegal registers.
        // The index must be computed relative to Dr0_write here; using
        // Dr0_read would yield indices beyond 7 and the DR4/DR5 check below
        // could never trigger.
        int i = static_cast<int>(reason) - static_cast<int>(Exit::Dr0_write);
        if (i == 4 || i == 5)
          {
            if (vms->vmcb()->state_save_area.cr4 & (1U << 3)) // CR4.DE set?
              {
                // DR4/DR5 access with debug extensions enabled: #UD.
                ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 6);
                return Retry;
              }
          }

        return Jump_instr;
      }

    case Exit::Dr8_write:
    case Exit::Dr9_write:
    case Exit::Dr10_write:
    case Exit::Dr11_write:
    case Exit::Dr12_write:
    case Exit::Dr13_write:
    case Exit::Dr14_write:
    case Exit::Dr15_write:
      // AMD APM Vol 2 Chapter 13.1.1.5 "64-Bit-Mode Extended Debug Registers":
      // DR8-15 are not implemented -> #UD
      ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 6);
      return Retry;

    case Exit::Vmrun:
    case Exit::Vmload:
    case Exit::Vmsave:
    case Exit::Stgi:
    case Exit::Clgi:
    case Exit::Skinit:
    case Exit::Rdtscp:
      // Unsupported instructions, inject undefined opcode exception
      ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 6);
      return Retry;

    case Exit::Sw_int:
      {
        // exit_info1[7:0] contains vector
        l4_uint32_t sw_int_num = vms->exit_info1() & 0xff;
        using Event_sw_int = Event_sw_generic<4>;
        ev_rec->make_add_event<Event_sw_int>(Event_prio::Sw_intN, sw_int_num,
                                             0U);
        return Retry;
      }

    case Exit::Icebp:
      // Emulating ICEBP this way leads to an additional DPL check, which INT1
      // does not do normally, but normally, the INT1 is for HW vendors only.
      ev_rec->make_add_event<Event_exc>(Event_prio::Sw_int1, 1); // #DB
      return Retry;

    case Exit::Shutdown:
      // Any event that triggers a shutdown, e.g. triple fault, lands here.
      info().printf("[%3u]: Shutdown intercept triggered at IP 0x%lx. Core in "
                    "shutdown mode.\n",
                    vcpu_id, vms->ip());
      vcpu.dump_regs_t(vms->ip(), info());

      // move CPU into stop state
      cpu->stop();
      return Retry;

    default:
      if (reason >= Exit::Excp_0 && reason <= Exit::Excp_31)
        {
          int exc_num = static_cast<unsigned>(reason)
                        - static_cast<unsigned>(Exit::Excp_0);
          return vms->handle_hardware_exception(ev_rec, exc_num);
        }

      warn().printf("[%3u]: Exit at guest IP 0x%lx with 0x%x (Info1: 0x%llx, "
                    "Info2: 0x%llx)\n",
                    vcpu_id, vms->ip(), static_cast<unsigned>(reason),
                    vms->exit_info1(), vms->exit_info2());

      auto str_exit_code = vms->str_exit_code(reason);
      if (str_exit_code)
        warn().printf("[%3u]: Unhandled exit reason: %s (%d)\n",
                      vcpu_id, str_exit_code, static_cast<unsigned>(reason));
      else
        warn().printf("[%3u]: Unknown exit reason: 0x%x\n",
                      vcpu_id, static_cast<unsigned>(reason));
      return -L4_ENOSYS;
    }
}
} // namespace

View File

@@ -0,0 +1,374 @@
/*
* Copyright (C) 2018-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Benjamin Lamowski <benjamin.lamowski@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "guest.h"
#include "debug.h"
#include "vm_state_vmx.h"
#include "vmx_exit_to_str.h"
#include "event_recorder.h"
namespace Vmm {
// Handle a VMX exit for string IO (INS/OUTS), optionally with a REP prefix.
// Iterates one element per loop round; the instruction-information field
// drives the REP condition and the guest-memory load/store helpers.
template <>
int
Guest::handle_io_access_string<Vmx_state>(unsigned port, bool is_in,
                                          Mem_access::Width op_width,
                                          bool is_rep, l4_vcpu_regs_t *regs,
                                          Vmx_state *vms)
{
  auto info
    = Vmx_state::Vmx_insn_info_field(vms->vmx_read(VMCS_VM_EXIT_INSN_INFO));

  while (1)
    {
      if (is_rep)
        {
          // REP prefix: Handle loop condition.
          bool next;
          int rv = vms->rep_prefix_condition(regs, info, &next);
          if (rv != Jump_instr)
            return rv;
          if (!next)
            break;
        }

      if (is_in)
        {
          l4_uint32_t value = ~0U;
          bool ret = handle_io_access_ptr(port, true, op_width, &value);
          if (!ret)
            {
              Dbg(Dbg::Dev, Dbg::Trace)
                .printf("WARNING: Unhandled string IO read port 0x%x/%u\n",
                        port, (1U << op_width) * 8);
              // Unhandled port: store all-ones into the guest buffer.
              int rv = vms->store_io_value(regs, _ptw, info, op_width, ~0U);
              if (rv != Jump_instr)
                return rv;
            }
          else
            {
              int rv = vms->store_io_value(regs, _ptw, info, op_width, value);
              if (rv != Jump_instr)
                return rv;
            }
        }
      else
        {
          l4_uint32_t value;
          int rv = vms->load_io_value(regs, _ptw, info, op_width, &value);
          if (rv != Jump_instr)
            return rv;
          // Unhandled output ports are logged and the value discarded.
          bool ret = handle_io_access_ptr(port, false, op_width, &value);
          if (!ret)
            Dbg(Dbg::Dev, Dbg::Trace)
              .printf("WARNING: Unhandled string IO write port 0x%x/%u <- "
                      "0x%x\n",
                      port, (1U << op_width) * 8, value);
        }

      // No REP prefix: Terminate the loop after the first iteration.
      if (!is_rep)
        break;
    }

  return Jump_instr;
}
template <>
int
Guest::handle_exit<Vmx_state>(Vmm::Cpu_dev *cpu, Vmx_state *vms)
{
using Exit = Vmx_state::Exit;
auto reason = vms->exit_reason();
Vmm::Vcpu_ptr vcpu = cpu->vcpu();
auto *regs = &vcpu->r;
auto *ev_rec = recorder(vcpu.get_vcpu_id());
unsigned vcpu_id = vcpu.get_vcpu_id();
if (reason != Vmx_state::Exit::Exec_vmcall)
trace().printf("[%3u]: Exit at guest IP 0x%lx SP 0x%lx with %llu ('%s') (Qual: 0x%llx)\n",
vcpu_id, vms->ip(), vms->sp(),
vms->vmx_read(VMCS_EXIT_REASON),
exit_reason_to_str(vms->vmx_read(VMCS_EXIT_REASON)),
vms->vmx_read(VMCS_EXIT_QUALIFICATION));
switch (reason)
{
case Exit::Cpuid: return handle_cpuid(vcpu);
case Exit::Exec_vmcall: return handle_vm_call(regs);
case Exit::Io_access:
{
auto qual = vms->vmx_read(VMCS_EXIT_QUALIFICATION);
unsigned qwidth = qual & 7;
bool is_read = qual & 8;
bool is_string = qual & 16;
bool is_rep = qual & 32;
bool is_imm = qual & 64;
unsigned port = (qual >> 16) & 0xFFFFU;
Dbg(Dbg::Dev, Dbg::Trace)
.printf("[%3u]: VM exit @ 0x%lx: IO access with exit qualification "
"0x%llx: %s port 0x%x %s%s%s\n",
vcpu_id, vms->ip(), qual, is_read ? "read" : "write", port,
is_imm ? "immediate" : "in DX", is_string ? " string" : "",
is_rep ? " rep" : "");
if (port == 0xcfb)
Dbg(Dbg::Dev, Dbg::Trace)
.printf("[%3u]: N.B.: 0xcfb IO port access @ 0x%lx\n", vcpu_id,
vms->ip());
Mem_access::Width op_width;
switch (qwidth)
{
// Only 0, 1, 3 are valid values in the exit qualification.
case 0: op_width = Mem_access::Wd8; break;
case 1: op_width = Mem_access::Wd16; break;
case 3: op_width = Mem_access::Wd32; break;
default:
warn().printf("[%3u]: Invalid IO access size %u @ 0x%lx\n",
vcpu_id, qwidth, vms->ip());
return Invalid_opcode;
}
if (is_string)
return handle_io_access_string(port, is_read, op_width, is_rep,
regs, vms);
return handle_io_access(port, is_read, op_width, regs);
}
// Ept_violation needs to be checked here, as handle_mmio needs a vCPU ptr,
// which cannot be passed to Vm_state/Vmx_state due to dependency reasons.
case Exit::Ept_violation:
{
auto guest_phys_addr =
vms->vmx_read(VMCS_GUEST_PHYSICAL_ADDRESS);
auto qual = vms->vmx_read(VMCS_EXIT_QUALIFICATION);
trace().printf("[%3u]: Exit reason due to EPT violation %i; gp_addr "
"0x%llx, qualification 0x%llx\n",
vcpu_id, static_cast<unsigned>(reason), guest_phys_addr,
qual);
auto ret = handle_mmio(guest_phys_addr, vcpu);
// XXX Idt_vectoring_info could be valid.
switch(ret)
{
case Retry: return L4_EOK;
case Jump_instr: return Jump_instr;
default: break;
}
warn().printf("[%3u]: Unhandled pagefault @ 0x%lx\n", vcpu_id,
vms->ip());
warn().printf("[%3u]: Read: %llu, Write: %llu, Inst.: %llu Phys addr: "
"0x%llx\n",
vcpu_id, qual & 1, qual & 2, qual & 4, guest_phys_addr);
if (qual & 0x80)
warn().printf("[%3u]: Linear address: 0x%llx\n", vcpu_id,
vms->vmx_read(VMCS_GUEST_LINEAR_ADDRESS));
return -L4_EINVAL;
}
// VMX specific exits
case Exit::Exception_or_nmi:
{
// XXX Idt_vectoring_info could be valid.
}
// FIXME entry info might be overwritten by exception handling
// currently this isn't fully fletched anyways so this works for now.
[[fallthrough]];
case Exit::External_int:
return vms->handle_exception_nmi_ext_int(ev_rec);
case Exit::Interrupt_window:
case Exit::Nmi_window:
return Retry;
case Exit::Exec_halt:
if (0)
info().printf("[%3u]: HALT @ 0x%llx! Activity state 0x%llx\n",
vcpu_id, vms->vmx_read(VMCS_GUEST_RIP),
vms->vmx_read(VMCS_GUEST_ACTIVITY_STATE));
vms->halt();
cpu->halt_cpu();
return Jump_instr;
case Exit::Exec_rdpmc:
return General_protection;
case Exit::Cr_access:
return vms->handle_cr_access(regs);
case Exit::Exec_rdmsr:
if (!msr_devices_rwmsr(regs, false, vcpu_id))
{
warn().printf("[%3u]: Reading unsupported MSR 0x%lx\n", vcpu_id,
regs->cx);
regs->ax = 0;
regs->dx = 0;
return General_protection;
}
return Jump_instr;
case Exit::Exec_wrmsr:
{
bool has_already_exception = ev_rec->has_exception();
if (!msr_devices_rwmsr(regs, true, vcpu.get_vcpu_id()))
{
warn().printf("[%3u]: Writing unsupported MSR 0x%lx\n", vcpu_id,
regs->cx);
return General_protection;
}
// Writing an MSR e.g. IA32_EFER can lead to injection of a HW exception.
// In this case the instruction wasn't emulated, thus don't jump it.
if (!has_already_exception && ev_rec->has_exception())
return Retry;
else
return Jump_instr;
}
case Exit::Virtualized_eoi:
Dbg().printf("[%3u]: INFO: EOI virtualized for vector 0x%llx\n",
vcpu_id, vms->vmx_read(VMCS_EXIT_QUALIFICATION));
// Trap like exit: IP already on next instruction
return L4_EOK;
case Exit::Exec_xsetbv:
if (regs->cx == 0)
{
l4_uint64_t value = (l4_uint64_t(regs->ax) & 0xFFFFFFFF)
| (l4_uint64_t(regs->dx) << 32);
vms->vmx_write(L4_VM_VMX_VMCS_XCR0, value);
trace().printf("[%3u]: Setting xcr0 to 0x%llx\n", vcpu_id, value);
return Jump_instr;
}
Dbg().printf("[%3u]: Writing unknown extended control register %ld\n",
vcpu_id, regs->cx);
return -L4_EINVAL;
case Exit::Apic_write:
// Trap like exit: IP already on next instruction
assert(0); // Not supported
return L4_EOK;
case Exit::Mov_debug_reg:
{
l4_uint64_t qual = vms->vmx_read(VMCS_EXIT_QUALIFICATION);
unsigned char dbg_reg = qual & 0x7;
bool read = qual & (1 << 4);
unsigned char gp_reg = (qual >> 8) & 0xf;
// check CR4.DE
if (dbg_reg == 4 || dbg_reg == 5)
{
if (vms->vmx_read(VMCS_GUEST_CR4) & (1U << 3)) // CR4.DE set?
return Invalid_opcode;
// else: alias to DR6 & DR7
}
if (read)
{
if (gp_reg == 0x4)
regs->sp = 0UL;
else
{
l4_umword_t *r = &(regs->ax);
*(r - gp_reg) = 0UL;
}
}
// else: ignore writes
trace().printf("[%3u]: MOV DR exit: %s DR%u %s GP%u. Value: 0x%lx\n",
vcpu_id, read ? "read" : "write", dbg_reg,
read ? "to" : "from", gp_reg, *(&(regs->ax) - gp_reg));
return Jump_instr;
}
case Exit::Exec_vmclear:
case Exit::Exec_vmlaunch:
case Exit::Exec_vmptrld:
case Exit::Exec_vmptrst:
case Exit::Exec_vmread:
case Exit::Exec_vmresume:
case Exit::Exec_vmwrite:
case Exit::Exec_vmxoff:
case Exit::Exec_vmxon:
case Exit::Exec_invept:
case Exit::Exec_invvpid:
case Exit::Exec_rdtscp:
// Unsupported instructions, inject undefined opcode exception
return Invalid_opcode;
case Exit::Triple_fault:
// Double-fault experienced exception. Set core into shutdown mode.
info().printf("[%3u]: Triple fault exit at IP 0x%lx. Core is in shutdown "
"mode.\n",
vcpu_id, vms->ip());
vcpu.dump_regs_t(vms->ip(), info());
// move CPU into stop state
cpu->stop();
return Retry;
case Exit::Entry_fail_invalid_guest:
{
auto qual = vms->vmx_read(VMCS_EXIT_QUALIFICATION);
auto reason_raw = vms->vmx_read(VMCS_EXIT_REASON);
auto ip = vms->ip();
auto insn_err = vms->vmx_read(VMCS_VM_INSN_ERROR);
auto entry_exc_err = vms->vmx_read(VMCS_VM_ENTRY_EXCEPTION_ERROR);
Dbg().printf("VM-entry failure due to invalid guest state:\n"
"Exit reason raw: 0x%llx\n"
"Exit qualification: 0x%llx\n"
"IP: 0x%lx\n"
"Instruction error: 0x%llx\n"
"Entry exception error: 0x%llx\n",
reason_raw, qual, ip, insn_err, entry_exc_err
);
}
[[fallthrough]];
case Exit::Task_switch:
case Exit::Apic_access:
case Exit::Ept_misconfig:
case Exit::Page_mod_log_full:
case Exit::Spp_related_event:
// These cases need to check IDT-vectoring info for validity!
default:
{
Dbg().printf("[%3u]: Exit at guest IP 0x%lx SP 0x%lx with 0x%llx "
"(Qual: 0x%llx)\n",
vcpu_id, vms->ip(), vms->sp(),
vms->vmx_read(VMCS_EXIT_REASON),
vms->vmx_read(VMCS_EXIT_QUALIFICATION));
unsigned reason_u = static_cast<unsigned>(reason);
if (reason_u < sizeof(str_exit_reason) / sizeof(*str_exit_reason))
Dbg().printf("[%3u]: Unhandled exit reason: %s (%d)\n",
vcpu_id, str_exit_reason[reason_u], reason_u);
else
Dbg().printf("[%3u]: Unknown exit reason: 0x%x\n", vcpu_id, reason_u);
return -L4_ENOSYS;
}
}
}
} // namespace

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,489 @@
/*
* Copyright (C) 2017-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/util/cpu.h>
#include <l4/vbus/vbus>
#include <l4/l4virtio/l4virtio>
#include <map>
#include <mutex>
#include <vector>
#include "cpu_dev_array.h"
#include "generic_guest.h"
#include "msr_device.h"
#include "cpuid_device.h"
#include "mem_access.h"
#include "vcpu_ptr.h"
#include "virt_lapic.h"
#include "vmprint.h"
#include "zeropage.h"
#include "pt_walker.h"
#include "vm_ram.h"
#include "binary_loader.h"
#include "event_recorder.h"
#include "pm_device_if.h"
namespace Vmm {
class Guest : public Generic_guest
{
public:
enum { Default_rambase = 0, Boot_offset = 0 };
enum { Has_io_space = true };
using Io_mem = std::map<Io_region, cxx::Ref_ptr<Io_device>>;
Guest()
: _apics(Vdev::make_device<Gic::Lapic_array>()),
_icr_handler(Vdev::make_device<Gic::Icr_handler>()),
_lapic_access_handler(Vdev::make_device<Gic::Lapic_access_handler>(
_apics, _icr_handler, get_max_physical_address_bit()))
{
add_mmio_device(_lapic_access_handler->mmio_region(),
_lapic_access_handler);
register_msr_device(_lapic_access_handler);
// Do this once for all TSC-based timers used in uvmm.
l4_calibrate_tsc(l4re_kip());
}
static Guest *create_instance();
static Guest *get_instance();
void setup_device_tree(Vdev::Device_tree) {}
void show_state_interrupts(FILE *, Vcpu_ptr) {}
void register_io_device(cxx::Ref_ptr<Vmm::Io_device> const &dev,
Region_type type,
Vdev::Dt_node const &node, size_t index = 0);
void add_io_device(Io_region const &region,
cxx::Ref_ptr<Io_device> const &dev);
void del_io_device(Io_region const &region);
/**
* Indicate whether the legacy i8042 keyboard controller is present.
*
* We assume that the legacy i8042 keyboard controller is present if the
* I/O ports 0x60 and 0x64 are registered.
*
* \retval true The legacy i8042 keyboard controller is present.
* \retval false The legacy i8042 keyboard controller is absent.
*/
bool i8042_present();
bool register_framebuffer(l4_uint64_t addr, l4_uint64_t size,
const L4Re::Video::View::Info &info);
/**
* Return IO port map.
*
* Must only be used before the guest started to run or for debugging. Might
* be manipulated concurrently from other vCPUs!
*/
Io_mem const *iomap()
{ return &_iomap; }
void register_msr_device(cxx::Ref_ptr<Msr_device> const &dev);
/**
* Register a CPUID-handling device in a list.
*
* \param dev CPUID-handling device to register.
*/
void register_cpuid_device(cxx::Ref_ptr<Cpuid_device> const &dev);
l4_addr_t load_binary(Vm_ram *ram, char const *binary,
Ram_free_list *free_list);
void prepare_platform(Vdev::Device_lookup *devs);
void prepare_binary_run(Vdev::Device_lookup *devs, l4_addr_t entry,
char const *binary, char const *cmd_line,
l4_addr_t dt_boot_addr);
void run(cxx::Ref_ptr<Cpu_dev_array> const &cpus);
void suspend(l4_addr_t wake_vector)
{
Vdev::Pm_device_registry::suspend_devices();
if (!_pm->suspend())
{
warn().printf("System suspend not possible. Waking up immediately.\n");
Vdev::Pm_device_registry::resume_devices();
return;
}
auto vcpu = _cpus->cpu(0)->vcpu();
/* Go to sleep */
vcpu.wait_for_ipc(l4_utcb(), L4_IPC_NEVER);
/* Back alive */
_pm->resume();
Vdev::Pm_device_registry::resume_devices();
vcpu.vm_state()->init_state();
vcpu.vm_state()->setup_real_mode(wake_vector);
info().printf("Waking CPU %u on EIP 0x%lx\n", 0, wake_vector);
}
void sync_all_other_cores_off() const override;
// returns the number of running cores
unsigned cores_running() const;
void handle_entry(Vcpu_ptr vcpu);
Gic::Virt_lapic *lapic(Vcpu_ptr vcpu)
{ return _apics->get(vcpu.get_vcpu_id()).get(); }
cxx::Ref_ptr<Gic::Lapic_array> apic_array() { return _apics; }
cxx::Ref_ptr<Gic::Icr_handler> icr_handler() { return _icr_handler; }
int handle_cpuid(Vcpu_ptr vcpu);
int handle_vm_call(l4_vcpu_regs_t *regs);
/**
* Access IO port and load/store the value to RAX.
*
* In case the given IO port is not handled by any device on read, the value
* of all ones is stored to RAX. Write errors are silently ignored.
*
* \param[in] port IO port to access.
* \param[in] is_in True if this is the IN (read) access.
* \param[in] op_width Width of the access (1/2/4 bytes).
* \param[in,out] regs Register file. The value read/written is
* stored/loaded into RAX.
*
* \retval Jump_instr Success, all errors are silently ignored.
*/
int handle_io_access(unsigned port, bool is_in, Mem_access::Width op_width,
l4_vcpu_regs_t *regs);
/**
* Access IO port (core implementation).
*
* Core implementation of accessing an IO port. The method looks up the
* device that handles the IO port and does the access.
*
* \param[in] port IO port to access.
* \param[in] is_in True if this is the IN (read) access.
* \param[in] op_width Width of the access (1/2/4 bytes).
* \param[in,out] value Value to read/write.
*
* \retval true The IO access was successful.
* \retval false No device handles the given IO port.
*/
bool handle_io_access_ptr(unsigned port, bool is_in,
Mem_access::Width op_width, l4_uint32_t *value);
void run_vm(Vcpu_ptr vcpu) L4_NORETURN;
Boot::Binary_type guest_type() const
{ return _guest_t; }
private:
enum : unsigned
{
Max_phys_addr_bits_mask = 0xff,
};
struct Xsave_state_area
{
struct Size_off { l4_uint64_t size = 0, offset = 0; };
enum
{
// Some indices are valid in xcr0, some in xss.
x87 = 0, // XCR0
sse, // XCR0
avx, // XCR0
mpx1, // XCR0
mpx2, // XCR0
avx512_1, // XCR0
avx512_2, // XCR0
avx512_3, // XCR0
pts, // XSS
pkru, // XCR0,
pasid, // XSS
cetu, // XSS
cets, // XSS
hdc, // XSS
uintr, // XSS
lbr, // XSS
hwp, // XSS
tilecfg, // XCR0
tiledata, // XCR0
Num_fields = 31,
};
bool valid = false;
// first two fields are legacy area, so always (size=0, offset=0);
Size_off feat[Num_fields];
};
void prepare_openbsd_binary_run(Vdev::Device_lookup *devs, l4_addr_t entry,
char const *binary, char const *cmd_line,
l4_addr_t dt_boot_addr);
void prepare_linux_binary_run(Vdev::Device_lookup *devs, l4_addr_t entry,
char const *binary, char const *cmd_line,
l4_addr_t dt_boot_addr);
template<typename VMS>
void run_vm_t(Vcpu_ptr vcpu, VMS *vm) L4_NORETURN;
template <typename VMS>
bool event_injection_t(Vcpu_ptr vcpu, VMS *vm);
template <typename VMS>
int handle_exit(Cpu_dev *cpu, VMS *vm);
/**
* Handle IO access VM exit in case of a [REP] INS/OUTS.
*
* \tparam VMS VM state type.
*
* \param[in] port IO port to access.
* \param[in] is_in True if this is the INS (read) access.
* \param[in] op_width Width of the IO access (1/2/4 bytes).
* \param[in] is_rep True if there is the REP prefix.
* \param[in,out] regs Register file.
* \param[in,out] vms VM state.
*
* \retval Jump_instr [REP] INS/OUTS instruction handled
* successfully.
* \retval Invalid_opcode Instruction decoding failure or unsupported
* CPU mode.
* \retval General_protection Segmentation fault.
* \retval Stack_fault Segmentation fault in the SS segment.
*/
template <typename VMS>
int handle_io_access_string(unsigned port, bool is_in,
Mem_access::Width op_width, bool is_rep,
l4_vcpu_regs_t *regs, VMS *vm);
unsigned get_max_physical_address_bit() const
{
l4_umword_t ax, bx, cx, dx;
// Check for highest extended CPUID leaf
l4util_cpu_cpuid(0x80000000, &ax, &bx, &cx, &dx);
if (ax >= 0x80000008)
l4util_cpu_cpuid(0x80000008, &ax, &bx, &cx, &dx);
else
{
// Check for highest basic CPUID leaf
l4util_cpu_cpuid(0x00, &ax, &bx, &cx, &dx);
if (ax >= 0x01)
{
l4util_cpu_cpuid(0x01, &ax, &bx, &cx, &dx);
if (dx & (1UL << 6)) // PAE
ax = 36;
else
ax = 32;
}
else
ax = 32; // Minimum if leaf not supported
}
return ax & Max_phys_addr_bits_mask;
}
bool msr_devices_rwmsr(l4_vcpu_regs_t *regs, bool write, unsigned vcpu_no);
/**
* Attempt to handle the CPUID instruction by consecutively trying handlers
* of the CPUID-handling devices registered in the _cpuid_devices list. The
* list is traversed from the front to the back.
*/
bool handle_cpuid_devices(l4_vcpu_regs_t const *regs, unsigned *a,
unsigned *b, unsigned *c, unsigned *d);
Event_recorder *recorder(unsigned num)
{ return _event_recorders.recorder(num); }
/**
* Perform actions necessary when changing from one Cpu_dev state to another.
*
* \tparam VMS SVM or VMX state type
* \param current Current CPU state
* \param new_state CPU state to transition into
* \param lapic local APIC of the current vCPU
* \param vm SVM or VMX state
* \param cpu current CPU device
*/
template <typename VMS>
bool state_transition_effects(Cpu_dev::Cpu_state const current,
Cpu_dev::Cpu_state const new_state,
Gic::Virt_lapic *lapic, VMS *vm, Cpu_dev *cpu);
/**
* Perform actions of the state the Cpu_dev just transitioned into.
*
* \tparam VMS SVM or VMX state type
* \param state New CPU state after state transition
* \param halt_req true, if `state` is the halt state and events are pending
* \param cpu current CPU device
* \param vm SVM or VMX state
*/
template <typename VMS>
bool new_state_action(Cpu_dev::Cpu_state state, bool halt_req, Cpu_dev *cpu,
VMS *vm);
void iomap_dump(Dbg::Verbosity l)
{
Dbg d(Dbg::Dev, l, "vmmap");
if (d.is_active())
{
d.printf("IOport map:\n");
std::lock_guard<std::mutex> lock(_iomap_lock);
for (auto const &r : _iomap)
d.printf(" [%4lx:%4lx]: %s\n", r.first.start, r.first.end,
r.second->dev_name());
}
}
std::mutex _iomap_lock;
Io_mem _iomap;
std::vector<cxx::Ref_ptr<Msr_device>> _msr_devices;
std::vector<cxx::Ref_ptr<Cpuid_device>> _cpuid_devices;
// devices
Guest_print_buffer _hypcall_print;
cxx::Ref_ptr<Pt_walker> _ptw;
cxx::Ref_ptr<Gic::Lapic_array> _apics;
cxx::Ref_ptr<Gic::Icr_handler> _icr_handler;
cxx::Ref_ptr<Gic::Lapic_access_handler> _lapic_access_handler;
Boot::Binary_type _guest_t;
cxx::Ref_ptr<Vmm::Cpu_dev_array> _cpus;
Vmm::Event_recorder_array _event_recorders;
Xsave_state_area _xsave_layout;
l4_addr_t _guest_size;
};
/**
* Handler for MSR read/write to a specific vCPU with its corresponding
* VM state.
*/
class Vcpu_msr_handler : public Msr_device
{
public:
Vcpu_msr_handler(Cpu_dev_array *cpus,
Vmm::Event_recorders *ev_rec)
: _cpus(cpus), _ev_rec(ev_rec)
{};
bool read_msr(unsigned msr, l4_uint64_t *value, unsigned vcpu_no) const override
{
return _cpus->vcpu(vcpu_no).vm_state()->read_msr(msr, value);
}
bool write_msr(unsigned msr, l4_uint64_t value, unsigned vcpu_no) override
{
return _cpus->vcpu(vcpu_no)
.vm_state()
->write_msr(msr, value, _ev_rec->recorder(vcpu_no));
}
private:
Cpu_dev_array *_cpus;
Event_recorders *_ev_rec;
};
/**
* Handler for MSR access to all MTRR registeres.
*
* MTRR are architectural registers and do not differ between AMD and Intel.
* MTRRs are core specific and must be kept in sync.
* Since all writes are ignored and reads just show the static state, we do
* no core specific handling for these registers.
*/
class Mtrr_msr_handler : public Msr_device
{
public:
Mtrr_msr_handler() = default;
bool read_msr(unsigned msr, l4_uint64_t *value, unsigned) const override
{
switch(msr)
{
case 0xfe: // IA32_MTRRCAP, RO
*value = 1U << 10; // WriteCombining support bit.
break;
case 0x2ff: // IA32_MTRR_DEF_TYPE
*value = 1U << 11; // E/MTRR enable bit
break;
// MTRRphysMask/Base[0-9]; only present if IA32_MTRRCAP[7:0] > 0
case 0x200: case 0x201: case 0x202: case 0x203: case 0x204: case 0x205:
case 0x206: case 0x207: case 0x208: case 0x209: case 0x20a: case 0x20b:
case 0x20c: case 0x20d: case 0x20e: case 0x20f: case 0x210: case 0x211:
case 0x212: case 0x213:
*value = 0;
break;
case 0x250: // MTRRfix64K_0000
[[fallthrough]];
case 0x258: // MTRRfix16K
[[fallthrough]];
case 0x259: // MTRRfix16K
[[fallthrough]];
// MTRRfix_4K_*
case 0x268: case 0x269: case 0x26a: case 0x26b: case 0x26c: case 0x26d:
case 0x26e: case 0x26f:
*value = 0;
break;
default:
return false;
}
return true;
}
bool write_msr(unsigned msr, l4_uint64_t, unsigned) override
{
switch(msr)
{
case 0x2ff: // MTRRdefType
// We report no MTRRs in the MTRRdefType MSR. Thus we ignore writes here.
// MTRRs might also be disabled temporarily by the guest.
break;
// Ignore all writes to MTRR registers, we flagged all of them as unsupported
// MTRRphysMask/Base[0-9]; only present if MTRRcap[7:0] > 0
case 0x200: case 0x201: case 0x202: case 0x203: case 0x204: case 0x205:
case 0x206: case 0x207: case 0x208: case 0x209: case 0x20a: case 0x20b:
case 0x20c: case 0x20d: case 0x20e: case 0x20f: case 0x210: case 0x211:
case 0x212: case 0x213:
break;
case 0x250: // MTRRfix64K_0000
[[fallthrough]];
case 0x258: // MTRRfix16K
[[fallthrough]];
case 0x259: // MTRRfix16K
[[fallthrough]];
// MTRRfix_4K_*
case 0x268: case 0x269: case 0x26a: case 0x26b: case 0x26c: case 0x26d:
case 0x26e: case 0x26f:
break;
default:
return false;
}
return true;
}
}; // class Mtrr_msr_handler
} // namespace Vmm

View File

@@ -0,0 +1,59 @@
/*
* Copyright (C) 2018, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/util/port_io.h>
#include "io_port_handler.h"
namespace Vdev {
void Io_port_handler::io_in(unsigned p, Mem_access::Width width, l4_uint32_t *value)
{
l4_uint16_t port = p + _base;
switch(width)
{
case Mem_access::Wd8:
*value = l4util_in8(port);
break;
case Mem_access::Wd16:
*value = l4util_in16(port);
break;
case Mem_access::Wd32:
*value = l4util_in32(port);
break;
case Mem_access::Wd64:
// architecture does not support 64bit port access
*value = -1;
break;
}
}
void Io_port_handler::io_out(unsigned p, Mem_access::Width width, l4_uint32_t value)
{
l4_uint16_t port = p + _base;
switch(width)
{
case Mem_access::Wd8:
l4util_out8(value, port);
break;
case Mem_access::Wd16:
l4util_out16(value, port);
break;
case Mem_access::Wd32:
l4util_out32(value, port);
break;
case Mem_access::Wd64:
// architecture does not support 64bit port access
break;
}
}
} // namespace Vdev

View File

@@ -0,0 +1,263 @@
/*
* Copyright (C) 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "device_factory.h"
#include "guest.h"
#include "ioapic.h"
namespace Gic {
l4_uint64_t Io_apic::read_reg(unsigned reg) const
{
switch (reg)
{
case Id_reg:
return _id;
case Version_reg:
return Io_apic_ver | ((Io_apic_num_pins - 1) << 16);
case Arbitration_reg:
return _id;
default:
{
unsigned index = reg - Redir_tbl_offset_reg;
unsigned irq = index / 2;
if (irq >= Io_apic_num_pins)
{
info().printf("Unimplemented MMIO read from ioregsel "
"register 0x%x\n", reg);
return -1;
}
if (index % 2)
return _redirect_tbl[irq].load().upper_reg();
else
return _redirect_tbl[irq].load().lower_reg()
& ~(1UL << Redir_tbl_entry::Nospec_level_set_bit);
}
}
}
void Io_apic::write_reg(unsigned reg, l4_uint64_t value)
{
if (reg == Id_reg)
{
_id = value;
return;
}
unsigned index = reg - Redir_tbl_offset_reg;
unsigned irq = index / 2;
if (irq >= Io_apic_num_pins)
{
info().printf("Unimplemented MMIO write to ioregsel register 0x%x\n",
reg);
return;
}
Redir_tbl_entry e = _redirect_tbl[irq];
Redir_tbl_entry e_new;
bool was_pending = e.is_pending();
do
{
e_new = e;
if (index % 2)
e_new.upper_reg() = value;
else
{
// ignore writes to RO fields
value = (value & ~Redir_tbl_entry::Ro_mask)
| e_new.delivery_status().get_unshifted()
| e_new.remote_irr().get_unshifted();
// retain level_set bit, if entry is still masked.
if ( value & (1 << Redir_tbl_entry::Masked_bit)
&& e_new.is_pending())
value |= (1 << Redir_tbl_entry::Nospec_level_set_bit);
e_new.lower_reg() = value;
}
}
while (!_redirect_tbl[irq].compare_exchange_weak(e, e_new));
if (!e_new.masked())
apic_bind_irq_src_handler(irq, e_new.vector(), e_new.dest_id(),
e_new.dest_mode());
// in case of level-triggerd IRQs deliver IRQ since level is high.
if (!e_new.masked() && was_pending)
{
trace()
.printf("IRQ %i not masked anymore. send pending level irq\n",
irq);
set(irq);
}
// no need to clear the level_set bit, we didn't write it into the new
// entry above.
}
l4_uint64_t Io_apic::read(unsigned reg, char, unsigned cpu_id)
{
switch (reg)
{
case Ioregsel:
return _ioregsel;
case Iowin:
return read_reg(_ioregsel.load());
case Eoir:
return 0UL;
default:
info().printf("Unimplemented MMIO read from register %d by CPU %d\n",
reg, cpu_id);
return -1;
}
}
void Io_apic::write(unsigned reg, char, l4_uint64_t value, unsigned cpu_id)
{
switch (reg)
{
case Ioregsel:
_ioregsel = value & 0xff;
break;
case Iowin:
write_reg(_ioregsel.load(), value);
break;
case Eoir:
clear_all_rirr(value & 0xff);
break;
default:
info().printf("Unimplemented MMIO write to register %d by CPU %d\n",
reg, cpu_id);
break;
}
}
void Io_apic::apic_bind_irq_src_handler(unsigned entry_num, unsigned vec,
unsigned dest, unsigned dest_mod)
{
Ioapic_irq_src_handler *hdlr = &_apic_irq_src[entry_num];
if (hdlr->vector != -1U)
{
// assumption: hdlr already bound
if (hdlr->vector == vec)
return;
else
apic_unbind_irq_src_handler(entry_num);
}
hdlr->vector = vec;
hdlr->dest = dest;
hdlr->dest_mod = dest_mod;
do_apic_bind_irq_src_handler(hdlr, true);
};
void Io_apic::apic_unbind_irq_src_handler(unsigned entry_num)
{
Ioapic_irq_src_handler *hdlr = &_apic_irq_src[entry_num];
if (hdlr->vector == -1U)
// don't unbind handler if not bound
return;
do_apic_bind_irq_src_handler(hdlr, false);
hdlr->vector = -1U;
hdlr->dest = -1U;
hdlr->dest_mod = 0U;
}
void Io_apic::do_apic_bind_irq_src_handler(Ioapic_irq_src_handler *hdlr,
bool bind)
{
Ioapic_irq_src_handler *new_hdlr = bind ? hdlr : nullptr;
if (hdlr->dest_mod == 0) // physical
{
auto apic = _lapics->get(hdlr->dest);
if (apic)
apic->bind_irq_src_handler(hdlr->vector, new_hdlr);
}
else
_lapics->apics_bind_irq_src_handler_logical(hdlr->dest, hdlr->vector,
new_hdlr);
}
void Io_apic::set(unsigned irq)
{
// send to PIC. (TODO only if line is masked at IOAPIC?)
if (irq < 16) // PIC can handle only the first 16 lines
_pic->set(irq);
Redir_tbl_entry entry = redirect(irq);
if (entry.masked())
{
if (entry.is_level_triggered())
// We must save the state of the level triggered IRQ, since we get
// the softIRQ only once and can't query the current level.
// We don't notice, if the actual HW line changes to no-IRQ again,
// but that's better than losing an IRQ here.
set_level_set(irq);
return;
}
if (entry.remote_irr())
{
// ignore re-triggered level-triggered IRQs that are in-service at
// local APIC
return;
}
Vdev::Msix::Data_register_format data(entry.vector());
data.trigger_mode() = entry.trigger_mode();
data.trigger_level() = !entry.pin_polarity(); // it's actually inverted.
data.delivery_mode() = entry.delivery_mode();
Vdev::Msix::Interrupt_request_compat addr(0ULL);
addr.dest_id() = entry.dest_id();
addr.dest_mode() = entry.dest_mode();
addr.fixed() = Vdev::Msix::Address_interrupt_prefix;
_distr->send(addr.raw, data.raw);
// update entry if necessary
if (entry.is_level_triggered())
set_remote_irr(irq);
}
} // namespace Gic
namespace {
struct F : Vdev::Factory
{
cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
Vdev::Dt_node const &node) override
{
auto msi_distr = devs->get_or_create_mc_dev(node);
auto apic_array = devs->vmm()->apic_array();
// Create the legacy PIC device here to forward legacy Interrupts.
auto pic = Vdev::make_device<Vdev::Legacy_pic>(msi_distr);
auto io_apic =
Vdev::make_device<Gic::Io_apic>(msi_distr, apic_array, pic);
devs->vmm()->add_mmio_device(io_apic->mmio_region(), io_apic);
// Register legacy PIC IO-ports
devs->vmm()->add_io_device(Vmm::Io_region(0x20, 0x21,
Vmm::Region_type::Virtual),
pic->master());
devs->vmm()->add_io_device(Vmm::Io_region(0xA0, 0xA1,
Vmm::Region_type::Virtual),
pic->slave());
return io_apic;
}
};
static F f;
static Vdev::Device_type d = {"intel,ioapic", nullptr, &f};
}

View File

@@ -0,0 +1,294 @@
/*
* Copyright (C) 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "mmio_device.h"
#include "debug.h"
#include "irq.h"
#include "msi_controller.h"
#include "msix.h"
#include "msi_arch.h"
#include "legacy_pic.h"
#include "monitor/ioapic_cmd_handler.h"
namespace Gic {
/**
* Virtual IOAPIC implementation of a 82093AA.
*
* The IOAPIC sends legacy IRQs onwards as MSI as programmed into the
* redirection table by the guest.
*/
class Io_apic : public Ic,
public Vmm::Mmio_device_t<Io_apic>,
public Monitor::Ioapic_cmd_handler<Monitor::Enabled, Io_apic>
{
enum
{
Io_apic_id = 0,
Io_apic_id_offset = 24,
Io_apic_ver = 0x20,
Io_apic_num_pins = 120,
Io_apic_mem_size = 0x1000,
Irq_cells = 1, // keep in sync with virt-pc.dts
};
enum Ioapic_mmio_regs
{
Ioregsel = 0,
Iowin = 0x10,
Eoir = 0x40,
};
enum Ioapic_regs
{
Id_reg = 0,
Version_reg = 1,
Arbitration_reg = 2,
Redir_tbl_offset_reg = 0x10,
};
struct Redir_tbl_entry
{
enum
{
Delivery_status_bit = 12,
Remote_irr_bit = 14,
Masked_bit = 16,
Nospec_level_set_bit = 17,
Ro_mask = 1U << Nospec_level_set_bit | 1U << Delivery_status_bit
| 1U << Remote_irr_bit,
};
Redir_tbl_entry() noexcept = default;
// The IOAPIC spec mentions bit 48, which is specified as reserved, bit 16
// is the mask bit and I think it's sane to start out with masked vectors.
l4_uint64_t raw = 1ULL << 16;
bool is_level_triggered() const { return trigger_mode(); }
bool is_pending() { return is_level_triggered() && level_set(); }
CXX_BITFIELD_MEMBER_RO(56, 63, dest_id, raw);
// use reserved bit for internal state of level triggered input line.
// only relevant, if line is masked
CXX_BITFIELD_MEMBER(17, 17, level_set, raw);
CXX_BITFIELD_MEMBER_RO(16, 16, masked, raw);
CXX_BITFIELD_MEMBER_RO(15, 15, trigger_mode, raw);
CXX_BITFIELD_MEMBER(14, 14, remote_irr, raw);
CXX_BITFIELD_MEMBER_RO(13, 13, pin_polarity, raw);
CXX_BITFIELD_MEMBER_RO(12, 12, delivery_status, raw);
CXX_BITFIELD_MEMBER_RO(11, 11, dest_mode, raw);
CXX_BITFIELD_MEMBER_RO(8, 10, delivery_mode, raw);
CXX_BITFIELD_MEMBER_RO(0, 7, vector, raw);
// Redirection Table entries can only be written as DWORD.
CXX_BITFIELD_MEMBER(0, 31, lower_reg, raw);
CXX_BITFIELD_MEMBER(32, 63, upper_reg, raw);
};
struct Ioapic_irq_src_handler : public Irq_src_handler
{
void eoi() override
{
assert(ioapic != nullptr);
// clear state in redirection table entry
ioapic->entry_eoi(irq_num);
{
// MSI generated from the IRQ can have multiple target cores. If this
// IRQ/MSI is level triggered, multiple cores would send an EOI.
// Would be insane, but who knows.
std::lock_guard<std::mutex> lock(_mtx);
// get IRQ src handler of input IRQ and forward EOI signal
Irq_src_handler *hdlr = ioapic->get_irq_src_handler(irq_num);
if (hdlr)
hdlr->eoi();
}
}
unsigned irq_num = 0;
Io_apic *ioapic = nullptr;
unsigned vector = -1U;
unsigned dest = -1U;
unsigned dest_mod = 0; // default: physical
private:
std::mutex _mtx;
};
public:
enum
{
Mmio_addr = 0xfec00000,
};
Io_apic(cxx::Ref_ptr<Gic::Msix_controller> distr,
cxx::Ref_ptr<Gic::Lapic_array> apic_array,
cxx::Ref_ptr<Vdev::Legacy_pic> pic)
: _distr(distr), _lapics(apic_array),
_id(Io_apic_id << Io_apic_id_offset), _ioregsel(0), _iowin(0),
_pic(pic)
{
// initialize IRQ src handler for LAPIC communication
for (unsigned i = 0; i < Io_apic_num_pins; ++i)
{
_apic_irq_src[i].irq_num = i;
_apic_irq_src[i].ioapic = this;
}
}
// public only for monitor access
l4_uint64_t read_reg(unsigned reg) const;
// Mmio device interface
l4_uint64_t read(unsigned reg, char, unsigned cpu_id);
void write(unsigned reg, char, l4_uint64_t value, unsigned cpu_id);
// IC interface
void set(unsigned irq) override;
void clear(unsigned) override {}
/**
* Bind the IRQ src handler of a level-triggered legacy interrupt.
*
* This handler is signaled, if the IOAPIC receives an EOI signal from the
* local APIC for the corresponding interrupt line.
*/
void bind_irq_src_handler(unsigned irq, Irq_src_handler *handler) override
{
if (irq >= Io_apic_num_pins)
{
warn().printf("Try to bind out-of-range IRQ %u. Ignoring. \n", irq);
return;
}
if (handler && _sources[irq])
L4Re::throw_error(-L4_EEXIST, "Bind IRQ src handler at IOAPIC." );
_sources[irq] = handler;
}
/**
* Get IRQ src handler bound for the given legacy interrupt line or
* `nullptr` if no handler is bound.
*/
Irq_src_handler *get_irq_src_handler(unsigned irq) const override
{
if (irq >= Io_apic_num_pins)
{
warn().printf("Try to get out-of-range IRQ %u. Ignoring. \n", irq);
return nullptr;
}
return _sources[irq];
}
int dt_get_interrupt(fdt32_t const *prop, int propsz,
int *read) const override
{
if (propsz < Irq_cells)
return -L4_ERANGE;
if (read)
*read = Irq_cells;
return fdt32_to_cpu(prop[0]);
}
Vmm::Region mmio_region() const
{
return Vmm::Region::ss(Vmm::Guest_addr(Mmio_addr), Io_apic_mem_size,
Vmm::Region_type::Virtual);
}
char const *dev_name() const override { return "Ioapic"; }

private:
  // Debug output channels with increasing severity.
  static Dbg trace() { return Dbg(Dbg::Irq, Dbg::Trace, "IOAPIC"); }
  static Dbg info() { return Dbg(Dbg::Irq, Dbg::Info, "IOAPIC"); }
  static Dbg warn() { return Dbg(Dbg::Irq, Dbg::Warn, "IOAPIC"); }

  void write_reg(unsigned reg, l4_uint64_t value);

  /// Return the redirection table entry for given `irq`.
  Redir_tbl_entry redirect(unsigned irq) const
  {
    assert(irq < Io_apic_num_pins);
    return _redirect_tbl[irq];
  }
void entry_eoi(unsigned irq)
{
assert(irq < Io_apic_num_pins);
// clear remote_irr and for level triggered the level_set bit.
Redir_tbl_entry e = _redirect_tbl[irq];
Redir_tbl_entry e_new;
do
{
e_new = e;
e_new.remote_irr() = 0;
e_new.level_set() = 0;
}
while (!_redirect_tbl[irq].compare_exchange_weak(e, e_new));
}
void set_level_set(unsigned irq)
{
assert(irq < Io_apic_num_pins);
Redir_tbl_entry e = _redirect_tbl[irq];
Redir_tbl_entry e_new;
do
{
e_new = e;
e_new.level_set() = 1;
}
while (!_redirect_tbl[irq].compare_exchange_weak(e, e_new));
}
void set_remote_irr(unsigned irq)
{
assert(irq < Io_apic_num_pins);
Redir_tbl_entry e = _redirect_tbl[irq];
Redir_tbl_entry e_new;
do
{
e_new = e;
e_new.remote_irr() = 1;
}
while (!_redirect_tbl[irq].compare_exchange_weak(e, e_new));
}
/// EOI every redirection entry currently programmed with vector `vec`.
void clear_all_rirr(l4_uint8_t vec)
{
  unsigned pin = 0;
  for (auto const &entry : _redirect_tbl)
    {
      if (entry.load().vector() == vec)
        entry_eoi(pin);
      ++pin;
    }
}
// Helpers managing the per-entry LAPIC-side IRQ source handlers.
void apic_bind_irq_src_handler(unsigned entry_num, unsigned vec,
                               unsigned dest, unsigned dest_mod);
void apic_unbind_irq_src_handler(unsigned entry_num);
void do_apic_bind_irq_src_handler(Ioapic_irq_src_handler *hdlr, bool bind);

// Downstream MSI-X controller receiving generated interrupt messages.
cxx::Ref_ptr<Gic::Msix_controller> _distr;
// The guest's local APICs.
cxx::Ref_ptr<Lapic_array> _lapics;
// IOAPIC ID register (guest writable).
std::atomic<l4_uint32_t> _id;
// Register-select / window registers of the indirect register interface.
std::atomic<l4_uint32_t> _ioregsel;
std::atomic<l4_uint32_t> _iowin;
// One redirection entry per IOAPIC pin; updated lock-free via CAS.
std::atomic<Redir_tbl_entry> _redirect_tbl[Io_apic_num_pins];
// Optional per-line IRQ source handlers (EOI notification).
Gic::Irq_src_handler *_sources[Io_apic_num_pins] = {};
// Legacy PIC pair, kept for legacy interrupt interaction.
cxx::Ref_ptr<Vdev::Legacy_pic> _pic;
// Per-pin handlers used for LAPIC communication.
Ioapic_irq_src_handler _apic_irq_src[Io_apic_num_pins];
}; // class Io_apic
} // namespace Gic

View File

@@ -0,0 +1,119 @@
/*
* Copyright (C) 2021, 2024 Kernkonzept GmbH.
* Author(s): Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
/**
* This implements a simple debug channel, similar to the ones implemented in
* Qemu and Bochs.
*
* This can be used for low-level debugging of guests.
*
* Example DT:
*
* \code{.dtb}
* isa {
* device_type = "eisa";
* #address-cells = <2>;
* #size-cells = <1>;
* // The first cell of a child nodes reg property encodes the
* // following information. See the ISA bus device-tree binding [2]
* // for more details:
* //
* // [2] 11-bit aliased (IOPORT only)
* // [1] 10-bit aliased (IOPORT only)
* // [0] 0=MMIO32, 1=IOPORT
* //
* // The standard ranges property defines the translation of child
* // reg address entries into the parent address space. Effectively
* // removes the upper word. For the purpose of the ISA translation,
* // only bit [0] is considered of the first word.
* ranges = <0x0 0x0 0x0 0x0 0xffffffff
* 0x1 0x0 0x0 0x0 0x1000>;
* isa_debugport {
* compatible = "l4vmm,isa-debugport";
* reg = <0x1 0x402 0x1>;
* l4vmm,vcon_cap = "debug";
* };
* };
* \endcode
*/
#include "device_factory.h"
#include "guest.h"
#include "device.h"
#include "io_device.h"
namespace Vdev {
class Isa_debugport : public Vmm::Io_device, public Vdev::Device
{
  // Value returned on reads; guests probe for it to detect the
  // Bochs/Qemu-style debug port.
  enum { Bochs_debug_port_magic = 0xe9 };

public:
  /**
   * Create the debug port on top of an L4 virtual console.
   *
   * \param con  Vcon capability that receives every byte the guest writes.
   *
   * The console is switched into raw mode so guest output is forwarded
   * unmodified. If querying the current attributes fails, the console is
   * left as-is and a warning is printed.
   */
  explicit Isa_debugport(L4::Cap<L4::Vcon> con)
  : _con(con)
  {
    l4_vcon_attr_t attr;
    if (l4_error(con->get_attr(&attr)) != L4_EOK)
      {
        // Fix: the failing operation is get_attr(), not set_attr(); report
        // it accurately so the log does not mislead.
        Dbg(Dbg::Dev, Dbg::Warn, "cons")
          .printf("WARNING: Cannot get console attributes. "
                  "Output may not work as expected.\n");
        return;
      }

    attr.set_raw();
    L4Re::chksys(con->set_attr(&attr), "console set_attr");
  }

  char const *dev_name() const override
  { return "ISA Debugport"; }

private:
  /// IO write from the guest to device: forward the low byte to the console.
  void io_out(unsigned, Vmm::Mem_access::Width, l4_uint32_t value) override
  {
    char s = value & 0xff;
    _con->write(&s, 1);
  }

  /// IO read from the guest: always returns the magic detection value.
  void io_in(unsigned, Vmm::Mem_access::Width, l4_uint32_t *value) override
  {
    *value = Bochs_debug_port_magic;
  }

  L4::Cap<L4::Vcon> _con;
};
} // namespace Vdev
namespace {

/// Factory creating the ISA debug port when the "l4vmm,vcon_cap"
/// capability is present on the device-tree node.
struct F : Vdev::Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &node) override
  {
    auto cap = Vdev::get_cap<L4::Vcon>(node, "l4vmm,vcon_cap");
    if (!cap)
      // Do not default to anything. If the cap is not there, there is no
      // debugport.
      return nullptr;

    auto port = Vdev::make_device<Vdev::Isa_debugport>(cap);
    devs->vmm()->register_io_device(port, Vmm::Region_type::Virtual, node);
    return port;
  }
}; // struct F

static F f;
static Vdev::Device_type t = {"l4vmm,isa-debugport", nullptr, &f};

} // namespace

View File

@@ -0,0 +1,32 @@
/*
* Copyright (C) 2018-2020, 2022, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "device_factory.h"
#include "guest.h"
#include "kvm_clock.h"
#include "mem_types.h"
namespace {

/// Factory registering the paravirtual kvm-clock MSR/CPUID device.
struct F : Vdev::Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &) override
  {
    auto *vmm = devs->vmm();
    auto clock = Vdev::make_device<Vdev::Kvm_clock_ctrl>(devs->ram(), vmm);
    vmm->register_msr_device(clock);
    vmm->register_cpuid_device(clock);
    return clock;
  }
}; // struct F

static F f;
static Vdev::Device_type t = {"kvm-clock", nullptr, &f};

} // namespace

View File

@@ -0,0 +1,235 @@
/*
* Copyright (C) 2018-2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Benjamin Lamowski <benjamin.lamowski@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/types.h>
#include <l4/util/rdtsc.h>
#include <l4/cxx/ref_ptr>
#include <vector>
#include "debug.h"
#include "mem_types.h"
#include "msr_device.h"
#include "cpuid_device.h"
#include "vm_ram.h"
#include "ds_mmio_mapper.h"
#include "cpu_dev.h"
#include "guest.h"
namespace Vdev {
/**
 * Per-vCPU time structure shared with the guest (kvm-clock ABI).
 *
 * The layout is fixed by the protocol; the static_assert below pins the
 * 32-byte size.
 */
struct Vcpu_time_info
{
  // Sequence counter: odd while an update is in progress.
  l4_uint32_t version;
  l4_uint32_t pad0;
  // TSC value at the time system_time was sampled.
  l4_uint64_t tsc_timestamp;
  l4_uint64_t system_time;
  // Multiplier/shift to convert TSC deltas to nanoseconds.
  l4_uint32_t tsc_to_system_mul;
  l4_int8_t tsc_shift;
  // bit 0 is set, if all Vcpu_time_info instances show the same TSC value.
  l4_uint8_t flags;
  l4_uint8_t pad[2];
};
static_assert(sizeof(Vcpu_time_info) == 32,
              "Vcpu_time_info structure is compact.");
/**
 * Per-core kvm-clock: periodically publishes TSC-derived time into the
 * guest-shared Vcpu_time_info structure.
 */
class Kvm_clock : public Vdev::Timer, public Device
{
public:
  Kvm_clock(Vcpu_time_info *vti, bool enable)
  {
    configure(vti, enable);
  }

  /**
   * (Re-)attach this clock to a guest-shared time structure.
   *
   * \param vti     Host pointer to the guest's Vcpu_time_info.
   * \param enable  Enable flag as written by the guest to the MSR.
   *
   * Initializes the conversion parameters; `tsc_shift = 5` pairs with
   * `l4_scaler_tsc_to_ns` as multiplier.
   */
  void configure(Vcpu_time_info *vti, bool enable)
  {
    _vcpu_time_enable = enable;
    vti->version = 0;
    vti->tsc_to_system_mul = l4_scaler_tsc_to_ns;
    vti->tsc_shift = 5;
    vti->flags = 0;
    _vcpu_time = vti;
  }

  /**
   * Publish the current time to the guest.
   *
   * Seqlock-style: version is bumped to an odd value before and an even
   * value after the update (cxx::write_now forces the ordering), so the
   * guest can detect torn reads.
   *
   * NOTE(review): _vcpu_time_enable is not checked here, so the structure
   * is updated even when the guest disabled the clock — confirm intended.
   */
  void tick()
  {
    auto now = l4_rdtsc();

    cxx::write_now(&(_vcpu_time->version), _vcpu_time->version + 1);
    _vcpu_time->tsc_timestamp = now;
    _vcpu_time->system_time = l4_tsc_to_ns(now);
    cxx::write_now(&(_vcpu_time->version), _vcpu_time->version + 1);
  }

private:
  Vcpu_time_info *_vcpu_time;
  bool _vcpu_time_enable;
  // NOTE(review): _mutex is never locked in this class as shown — either
  // remove it or use it to serialize configure()/tick(); verify callers.
  std::mutex _mutex;
};
/**
 * MSR/CPUID front end of the kvm-clock paravirtual clock.
 *
 * Advertises the clock via CPUID leaves 0x4000'0000/0x4000'0001 and
 * instantiates one Kvm_clock per vCPU when the guest programs the
 * system-time MSR.
 */
class Kvm_clock_ctrl : public Vmm::Msr_device,
                       public Vmm::Cpuid_device,
                       public Device
{
  // Guest-shared wall clock structure (kvm-clock ABI).
  struct Wall_clock
  {
    l4_uint32_t version;
    l4_uint32_t sec;
    l4_uint32_t nsec;
  };
  static_assert(sizeof(Wall_clock) == 3 * 4,
                "KVM Wall_clock struct is compact.");

  enum Kvm_msrs : unsigned
  {
    Msr_kvm_wall_clock_new = 0x4b564d00,
    Msr_kvm_system_time_new = 0x4b564d01,
    Msr_kvm_async_pf_en = 0x4b564d02,
    Msr_kvm_steal_time = 0x4b564d03,
    Msr_kvm_eoi_en = 0x4b564d04,
  };

public:
  /**
   * \param memmap  Guest RAM mapping used to translate guest-physical
   *                addresses written to the MSRs into host pointers.
   * \param vmm     The guest instance.
   *
   * _boottime captures the TSC at construction; the wall clock reports
   * seconds/nanoseconds derived from that value.
   */
  Kvm_clock_ctrl(cxx::Ref_ptr<Vmm::Vm_ram> const &memmap,
                 Vmm::Guest *vmm)
  : _boottime(l4_rdtsc()),
    _memmap(memmap),
    _vmm(vmm)
  {}

  bool read_msr(unsigned, l4_uint64_t *, unsigned) const override
  {
    // Nothing to read, above structures are memory mapped in the guest.
    return false;
  }

  /**
   * Handle guest writes to the kvm-clock MSRs.
   *
   * \return true if the MSR was handled, false to let other devices try.
   */
  bool write_msr(unsigned msr, l4_uint64_t addr, unsigned core_no) override
  {
    switch (msr)
      {
      case Msr_kvm_wall_clock_new:
        {
          trace().printf("Msr_kvm_wall_clock_new with addr 0x%llx\n", addr);
          // address must be 4-byte aligned
          auto gaddr = Vmm::Guest_addr(addr & (-1UL << 2));
          set_wall_clock(static_cast<Wall_clock *>(host_addr(gaddr)));
          break;
        }
      case Msr_kvm_system_time_new:
        {
          trace().printf("Msr_kvm_system_time_new to addr 0x%llx\n", addr);
          // Bit 0 of the written value is the enable flag.
          bool enable = addr & 1;
          // address must be 4-byte aligned
          auto gaddr = Vmm::Guest_addr(addr & (-1UL << 2));
          setup_vcpu_time(static_cast<Vcpu_time_info *>(host_addr(gaddr)),
                          enable, core_no);
          break;
        }
      // NOTE: below functions are disabled via CPUID leaf 0x4000'0001 and
      // shouldn't be invoked by a guest.
      case Msr_kvm_async_pf_en:
        warn().printf("KVM async pf not implemented.\n");
        break;
      case Msr_kvm_steal_time:
        warn().printf("KVM steal time not implemented.\n");
        break;
      case Msr_kvm_eoi_en:
        warn().printf("KVM EIO not implemented.\n");
        break;
      // If the guest Linux is compiled with CONFIG_KVM and no-kvmclock is
      // set on the command line, Linux will try to write to these MSRs on
      // shutdown. We ignore that.
      case 0x11:
      case 0x12:
        return true;
      default:
        return false;
      }

    return true;
  }

  /**
   * Answer the KVM CPUID identification and feature leaves.
   *
   * Leaf 0x4000'0000 returns the "KVMKVMKVM" signature; 0x4000'0001
   * advertises only the new clocksource MSRs plus the stable bit.
   */
  bool handle_cpuid(l4_vcpu_regs_t const *regs, unsigned *a, unsigned *b,
                    unsigned *c, unsigned *d) const override
  {
    enum Cpuid_kvm_constants
    {
      Kvm_feature_clocksource = 1UL, // clock at msr 0x11 & 0x12
      Kvm_feature_clocksource2 = 1UL << 3, // clock at msrs 0x4b564d00 & 01;
      // host communicates synchronized KVM clocks via Vcpu_time_info.flags[0]
      Kvm_feature_clocksource_stable_bit = 1UL << 24,
    };

    switch (regs->ax)
      {
      case 0x40000000:
        *a = 0x40000001; // max CPUID leaf in the 0x4000'0000 range.
        *b = 0x4b4d564b; // "KVMK"
        *c = 0x564b4d56; // "VMKV"
        *d = 0x4d; // "M\0\0\0"
        return true;
      case 0x40000001:
        *a = Kvm_feature_clocksource2 | Kvm_feature_clocksource_stable_bit;
        *d = 0;
        *b = *c = 0;
        return true;
      default:
        return false;
      }
  }

private:
  /// Write boot-relative wall-clock time into the guest structure,
  /// framed by version writes (1 = update in progress, 0 = consistent).
  void set_wall_clock(Wall_clock *cs) const
  {
    trace().printf("Set wall clock address: %p \n", cs);
    cxx::write_now(&(cs->version), 1U);
    l4_tsc_to_s_and_ns(_boottime, &(cs->sec), &(cs->nsec));
    cxx::write_now(&(cs->version), 0U);
  }

  /// Create or reconfigure the per-core Kvm_clock backing `core_no`.
  void setup_vcpu_time(Vcpu_time_info *vti, bool enable, unsigned core_no)
  {
    trace().printf("set system time address: %p: enable: %i, scaler 0x%x\n",
                   vti, enable, l4_scaler_tsc_to_ns);

    if (core_no >= _clocks.size())
      _clocks.resize(core_no + 1);

    if (_clocks[core_no])
      _clocks[core_no]->configure(vti, enable);
    else
      {
        auto clock_dev = Vdev::make_device<Kvm_clock>(vti, enable);
        _clocks[core_no] = clock_dev;
        clock_dev->tick();
      }
  }

  /// Translate a guest-physical address into a host pointer.
  void *host_addr(Vmm::Guest_addr addr) const
  {
    return _memmap->guest2host<void *>(addr);
  }

  static Dbg trace() { return Dbg(Dbg::Dev, Dbg::Trace, "KVMclock"); }
  static Dbg warn() { return Dbg(Dbg::Dev, Dbg::Warn, "KVMclock"); }

  // TSC value captured at construction time.
  l4_cpu_time_t _boottime;
  // One clock per vCPU, indexed by core number.
  std::vector<cxx::Ref_ptr<Kvm_clock>> _clocks;
  cxx::Ref_ptr<Vmm::Vm_ram> _memmap;
  // NOTE(review): _vmm is stored but not used in this class as shown —
  // verify whether it is still needed.
  Vmm::Guest *_vmm;
};
} // namespace

View File

@@ -0,0 +1,35 @@
/*
* Copyright (C) 2018-2022, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "legacy_pic.h"
#include "device_factory.h"
#include "guest.h"
namespace
{

/// Factory wiring a virtual i8259 PIC pair to the legacy IO-port ranges.
struct F : Vdev::Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &node) override
  {
    auto msi_distr = devs->get_or_create_mc_dev(node);
    Dbg().printf("PIC found MSI ctrl %p\n", msi_distr.get());

    auto pic = Vdev::make_device<Vdev::Legacy_pic>(msi_distr);
    auto *vmm = devs->vmm();
    // Master chip at command/data ports 0x20/0x21, slave at 0xA0/0xA1.
    vmm->add_io_device(Vmm::Io_region(0x20, 0x21, Vmm::Region_type::Virtual),
                       pic->master());
    vmm->add_io_device(Vmm::Io_region(0xA0, 0xA1, Vmm::Region_type::Virtual),
                       pic->slave());
    return pic;
  }
}; // struct F

static F f;
static Vdev::Device_type t = {"virt-i8259-pic", nullptr, &f};

} // namespace

View File

@@ -0,0 +1,488 @@
/*
* Copyright (C) 2018-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "io_device.h"
#include "device.h"
#include "irq.h"
#include "msi_arch.h"
#include "msi_controller.h"
#include <l4/cxx/bitfield>
namespace Vdev {
/**
* Emulation of a programmable interrupt controller.
*
* Example of a device tree entry:
*
* \code{.dtb}
* PIC: pic {
* compatible = "virt-pic";
* reg = <0x0 0x0 0x0 0x0>;
* msi-parent = <&msi_ctrl>;
* interrupt-controller;
* #interrupt-cells = <1>;
* };
* \endcode
*
* The PIC emulation provides the guest with the ability to assign the legacy
* interrupts of the master and slave PIC to a software defined range of two
* times eight consecutive interrupt numbers.
* The emulation reacts to IO-ports 0x20/0x21 and 0xA0/0xA1 as Command/Data
* port combination for the master and slave chips.
*/
class Legacy_pic : public Gic::Ic
{
  enum Config
  {
    Num_irqs = 16 // Number of IRQs supported by PIC
  };

  // Offsets of the two IO ports each chip reacts to.
  enum Ports
  {
    Cmd_port = 0,
    Data_port = 1,
  };

  // Initialization Command Word the chip expects next; ICW1 doubles as
  // "initialization complete / normal operation".
  enum class Init_words
  {
    ICW1 = 0,
    ICW2,
    ICW3,
    ICW4,
  };
/**
 * Single PIC-chip emulation handling IO-port access and interrupt offsets.
 */
class Chip : public Vmm::Io_device
{
  // Register set
  // We only support ICW1 == 0x11. (ICW4 | INIT).
  struct ICW1
  {
    l4_uint8_t raw;
    CXX_BITFIELD_MEMBER(0, 0, icw4, raw);
    CXX_BITFIELD_MEMBER(1, 1, single, raw); // only support 0
    CXX_BITFIELD_MEMBER(2, 2, address_interval, raw); // only support 0
    CXX_BITFIELD_MEMBER(3, 3, level_triggered_mode, raw); // ignore
    CXX_BITFIELD_MEMBER(4, 4, init, raw);
  };

  struct ICW4
  {
    l4_uint8_t raw;
    CXX_BITFIELD_MEMBER(0, 0, upm, raw); // 8086 mode, only one supported
    /**
     * Note from 8259a manual:
     * 8259As with a copyright date of 1985 or later will operate in the AEOI
     * mode as a master or a slave.
     * In AEOI mode interrupts are acked on delivery.
     */
    CXX_BITFIELD_MEMBER(1, 1, aeoi, raw);
    CXX_BITFIELD_MEMBER(2, 2, buffer_master, raw);
    CXX_BITFIELD_MEMBER(3, 3, buffer_mode, raw);
    CXX_BITFIELD_MEMBER(3, 3, sfnm, raw); // One iff special fully nested mode.
  };

  // Operation Command Word 2: EOI handling.
  struct OCW2
  {
    l4_uint8_t raw;
    CXX_BITFIELD_MEMBER(0, 2, irq, raw);
    CXX_BITFIELD_MEMBER(5, 5, eoi, raw);
    CXX_BITFIELD_MEMBER(6, 6, sl, raw);
  };

  // Operation Command Word 3: register read selection.
  struct OCW3
  {
    l4_uint8_t raw;
    CXX_BITFIELD_MEMBER(0, 0, ris, raw);
    CXX_BITFIELD_MEMBER(1, 1, rr, raw);
    CXX_BITFIELD_MEMBER(2, 2, poll, raw);
    CXX_BITFIELD_MEMBER(5, 5, smm, raw);
    CXX_BITFIELD_MEMBER(6, 6, esmm, raw);
  };

  // Selected IRR/ISR register by OCW3 for even port reads
  bool _read_isr = false;
  // Interrupt service register. Stores the Irq currently being serviced.
  l4_uint8_t _isr = 0;
  // Interrupt request register. Stores incoming Irq requesting to be
  // serviced.
  l4_uint8_t _irr = 0;
  // Interrupt mask register. Masks out interrupts.
  l4_uint8_t _imr = 0;
  // Needed to keep track of initialization sequence
  Init_words _expect = Init_words::ICW1;
  // Offset of interrupts
  l4_uint8_t _offset = 0;
  l4_uint8_t _slave_at = 0;
  struct ICW1 _icw1 {0}; // store to keep track of single mode and icw4
  struct ICW4 _icw4 {0}; // store to keep track of aeoi mode
  bool _is_master;
  Legacy_pic *_pic;

public:
  // AEOI is the default until the guest programs ICW4 differently.
  Chip(bool master, Legacy_pic *pic) : _is_master(master), _pic(pic)
  {
    _icw4.aeoi() = 1;
  }

  char const *dev_name() const override
  { return "PIC"; }
/// Check interrupt mask/in-service state and return the IRQ number with
/// the programmed vector offset applied, or -1 if the IRQ was only
/// latched (masked, another IRQ in service, or chip not initialized).
int trigger(unsigned irq)
{
  // No vector offset programmed yet: chip is not initialized.
  if (!_offset)
    return -1;

  unsigned const bit = 1U << irq;
  bool const deliverable = _isr == 0 && !(_imr & bit);

  if (!deliverable)
    {
      // Remember the request; it is re-evaluated on EOI/unmask.
      _irr |= bit;
      return -1;
    }

  // Without auto-EOI the line stays in service until the guest EOIs it.
  if (!_icw4.aeoi())
    _isr |= bit;
  _irr &= ~bit;
  return _offset + irq;
}
public:
  /// Handle read accesses on the PICs command and data ports.
  // Only byte-wide accesses are served; anything else reads as -1.
  void io_in(unsigned port, Vmm::Mem_access::Width width, l4_uint32_t *value)
  override
  {
    *value = -1U;
    if (width != Vmm::Mem_access::Width::Wd8)
      return;

    switch (port)
      {
      case Cmd_port:
        // OCW3 previously selected whether ISR or IRR is visible here.
        *value = _read_isr ? _isr : _irr;
        break;
      case Data_port:
        *value = _imr;
        trace().printf("%s read mask 0x%x\n",
                       _is_master ? "Master:" : "Slave:", _imr);
        break;
      }

    trace().printf("%s port in: %s - 0x%x\n",
                   _is_master ? "Master:" : "Slave:",
                   port == 0 ? "cmd" : "data", *value);
  }
/// Handle write accesses on the PICs command and data ports.
/// Only byte-wide accesses are processed; others are ignored.
void io_out(unsigned port, Vmm::Mem_access::Width width, l4_uint32_t value)
override
{
  if (width != Vmm::Mem_access::Width::Wd8)
    return;

  trace().printf("%s port out: %s - 0x%x\n",
                 _is_master ? "Master:" : "Slave:",
                 port == 0 ? "cmd" : "data", value);

  if (port == Cmd_port)
    handle_command_write(value);
  else if (port == Data_port)
    handle_data_write(value);
}
private:
  /// Return the number of the first deliverable (pending and unmasked)
  /// interrupt and move it from IRR to ISR, or -1 if none can be issued.
  int check_pending()
  {
    // Pending requests that are not masked out.
    l4_uint8_t const deliverable = _irr & static_cast<l4_uint8_t>(~_imr);

    if (_isr || !deliverable)
      // we cannot issue new interrupts
      // if an interrupt is currently in service
      // or if all pending interrupts (in irr) are masked
      return -1;

    // Fix: scan only the unmasked pending bits. The previous code scanned
    // `_irr` directly and could deliver a *masked* interrupt whenever a
    // different, unmasked one made the guard above pass.
    for (int i = 0; deliverable >> i; ++i)
      {
        l4_uint8_t bit = 1U << i;
        if (deliverable & bit)
          {
            _irr &= ~bit;  // consumed; masked requests stay latched in _irr
            _isr |= bit;
            return i;
          }
      }

    return -1;
  }
/**
 * EOI of last issued interrupt
 *
 * irq == 0 is treated as a non-specific EOI and clears the whole ISR.
 *
 * NOTE(review): a *specific* EOI for line 0 (OCW2 with SL set and level 0)
 * is indistinguishable from the non-specific case here and also clears
 * all ISR bits — confirm this is acceptable for the intended guests.
 */
void eoi(unsigned irq = 0)
{
  if (!irq)
    _isr = 0;
  else
    _isr &= ~(1U << irq);

  // Forward the EOI with the global (0-15) line number.
  if (_is_master)
    _pic->eoi(irq);
  else
    _pic->eoi(irq + 8);

  // A finished interrupt may unblock a latched one.
  issue_next_interrupt();
}
/// Deliver the next deliverable latched interrupt, if any.
void issue_next_interrupt()
{
  int const pending = check_pending();
  if (pending >= 0)
    _pic->send_interrupt(pending + _offset);
}
/**
 * Reset the chip to its initial configuration (start of an ICW1
 * initialization sequence); AEOI becomes the default again.
 */
void reset()
{
  _irr = 0;
  _imr = 0;
  _isr = 0;
  _expect = Init_words::ICW1;
  _offset = 0;
  _slave_at = 0;
  _icw1 = {0U};
  _icw4 = {0U};
  _icw4.aeoi() = 1;
}
/// Dispatch a write to the command port: ICW1 restarts initialization,
/// otherwise the byte is interpreted as OCW2 (EOI) or OCW3 (read select).
void handle_command_write(l4_uint32_t command)
{
  l4_uint8_t cmd = command;

  if (cmd & 0x10) // ICW1
    {
      // start initialization sequence
      reset();
      _icw1 = {cmd};
      if (_icw1.address_interval() || _icw1.single())
        warn().printf("Unsupported initialization value.\n");

      _expect = Init_words::ICW2;
      return;
    }

  if (_expect != Init_words::ICW1) // are we still in initialization?
    {
      warn().printf("%s: PIC is in initialization and guest wrote OCW (%x). Ignoring.\n",
                    _is_master ? "Master" : "Slave", cmd);
      return;
    }

  // handle OCWs
  if (cmd & 0x8)
    {
      // OCW3: bit 3 set distinguishes it from OCW2.
      struct OCW3 o{cmd};
      if (o.rr())
        {
          // Select ISR or IRR for subsequent command-port reads.
          _read_isr = o.ris();
          return;
        }
      // ignore the rest
    }
  else // OCW2
    {
      struct OCW2 o{cmd};
      if (o.eoi())
        {
          if (o.sl())
            eoi(o.irq()); // specific EOI
          else
            eoi();        // non-specific EOI
        }
      // ignore the rest for now
    }
}
/// Dispatch a write to the data port: during initialization it is the
/// next expected ICW; in normal operation it is OCW1 (the IRQ mask).
void handle_data_write(l4_uint32_t value)
{
  if (_expect != Init_words::ICW1) // we are in initialization
    {
      switch (_expect)
        {
        case Init_words::ICW1: break; // avoid compiler warning
        case Init_words::ICW2:
          // ICW2: vector offset of this chip.
          _offset = value;
          if (_icw1.single())
            {
              if (_icw1.icw4())
                _expect = Init_words::ICW4;
              else
                _expect = Init_words::ICW1; // initialization complete
            }
          else
            _expect = Init_words::ICW3;

          warn().printf("%s: Vector offset %u\n",
                        _is_master ? "MASTER" : "SLAVE", _offset);
          break;
        case Init_words::ICW3:
          // ICW3: cascade wiring (slave line on master / slave ID on slave).
          _slave_at = value;
          if (_icw1.icw4())
            _expect = Init_words::ICW4;
          else
            {
              _expect = Init_words::ICW1; // initialization complete
              _read_isr = false;
            }
          break;
        case Init_words::ICW4:
          _icw4.raw = value;
          if (!_icw4.upm())
            warn().printf("Guest tries to set MCS-80 mode. Unsupported.\n");

          _expect = Init_words::ICW1; // initialization complete
          _read_isr = false;
          break;
        }
      return;
    }

  // OCW1
  _imr = value;
  // immediately inject pending irqs
  issue_next_interrupt();
}
};
public:
  /**
   * Create a legacy PIC consisting of a master and slave chip.
   *
   * \param distr MSI-parent to send interrupts to.
   */
  Legacy_pic(cxx::Ref_ptr<Gic::Msix_controller> distr)
  : _master(Vdev::make_device<Chip>(true, this)),
    _slave(Vdev::make_device<Chip>(false, this)),
    _distr(distr)
  {
    info().printf("Hello, Legacy_pic\n");
  }
/// Issue a legacy interrupt in range [0, 15]: lines 0-7 go to the master
/// chip, 8-15 to the slave. Vectors below 32 are never forwarded.
void set(unsigned irq) override
{
  assert(irq < Num_irqs);

  int const vec = irq < 8
                    ? _master->trigger(irq)
                    : _slave->trigger(irq - 8);

  // TODO: Do we need to set the _master line where the slave is wired to?
  if (vec >= 32)
    send_interrupt(vec);
}
/// Forward interrupt vector `irq` as an ExtINT MSI message to the MSI
/// parent; vectors below 32 are silently dropped.
void send_interrupt(int irq)
{
  if (irq < 32)
    return;

  using namespace Vdev::Msix;

  // dest_id = 0, redirect_hint = 0, dest_mode = 0;
  Interrupt_request_compat addr(0ULL);
  addr.fixed() = Address_interrupt_prefix;

  Data_register_format data(0U);
  data.vector() = irq;
  data.delivery_mode() = Dm_extint;

  _distr->send(addr.raw, data.raw);
}
// Nothing to do on clear for the message-based PIC emulation.
void clear(unsigned) override {}

/// Bind an IRQ source handler for legacy line `irq`; throws -L4_EEXIST if
/// a handler is already bound.
void bind_irq_src_handler(unsigned irq, Gic::Irq_src_handler *handler) override
{
  assert(irq < Num_irqs);
  if (handler && _sources[irq])
    throw L4::Runtime_error(-L4_EEXIST);

  _sources[irq] = handler;
}

/// Return the handler bound to `irq` or nullptr.
Gic::Irq_src_handler *get_irq_src_handler(unsigned irq) const override
{
  assert(irq < Num_irqs);
  return _sources[irq];
}

/// Notify the source handler of line `irq` (0-15) about an EOI.
void eoi(unsigned irq)
{
  assert(irq < Num_irqs);
  if (_sources[irq])
    _sources[irq]->eoi();
}
/// Parse one interrupt specifier (a single cell) from a device-tree
/// "interrupts" property; returns the IRQ number or -L4_ERANGE.
int dt_get_interrupt(fdt32_t const *prop, int propsz, int *read) const override
{
  enum { Irq_cells = 1, };

  if (propsz >= Irq_cells)
    {
      if (read)
        *read = Irq_cells;

      return fdt32_to_cpu(prop[0]);
    }

  return -L4_ERANGE;
}
/// Obtain a pointer to the master PIC chip.
cxx::Ref_ptr<Chip> master() const { return _master; }
/// Obtain a pointer to the slave PIC chip.
cxx::Ref_ptr<Chip> slave() const { return _slave; }

private:
  // Debug output channels with increasing severity.
  static Dbg trace() { return Dbg(Dbg::Irq, Dbg::Trace, "PIC"); }
  static Dbg info() { return Dbg(Dbg::Irq, Dbg::Info, "PIC"); }
  static Dbg warn() { return Dbg(Dbg::Irq, Dbg::Warn, "PIC"); }

  cxx::Ref_ptr<Chip> _master;
  cxx::Ref_ptr<Chip> _slave;
  // MSI parent receiving the generated ExtINT messages.
  cxx::Ref_ptr<Gic::Msix_controller> _distr;
  // Per-line EOI notification handlers.
  Gic::Irq_src_handler *_sources[Num_irqs] = {};
};
} // namespace Vdev

View File

@@ -0,0 +1,651 @@
/*
* Copyright (C) 2017-2018, 2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Adam Lackorzynski <adam@l4re.org>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <cstdio>
#include <cstring>

#include <l4/cxx/bitfield>
#include <l4/cxx/exceptions>
#include <l4/re/error_helper>

#include "mad.h"
namespace L4mad
{
// Register name tables indexed by the architectural register number, one
// table per operand width (32/16-bit and the low/high byte halves).
static const char *reg_names_x86_32[] = {
  "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi" };

static const char *reg_names_x86_16[] = {
  "ax", "cx", "dx", "bx", "sp", "bp", "si", "di" };

static const char *reg_names_x86_8l[] = {
  "al", "cl", "dl", "bl" };

static const char *reg_names_x86_8h[] = {
  "ah", "ch", "dh", "bh" };

#ifdef ARCH_amd64
// Register numbers in instruction-encoding order (REX extends to r8-r15).
enum Reg_names_amd64 { Reg_rax, Reg_rcx, Reg_rdx, Reg_rbx, Reg_rsp, Reg_rbp,
                       Reg_rsi, Reg_rdi, Reg_r8, Reg_r9, Reg_r10, Reg_r11, Reg_r12,
                       Reg_r13, Reg_r14, Reg_r15,
                       Reg_eax = Reg_rax
};

static const char *reg_names_x86_64[]
  = { "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" };
#elif defined(ARCH_x86)
enum Reg_names_x86 { Reg_eax, Reg_ecx, Reg_edx, Reg_ebx, Reg_esp, Reg_ebp,
                     Reg_esi, Reg_edi };
#endif
/// Convert a Width enumerator into the corresponding byte count (1/2/4/8);
/// throws -L4_EINVAL for any value outside the enumeration.
static unsigned width_in_bytes(Width width)
{
  switch (width)
    {
    case Width::Wd8: return 1;
    case Width::Wd16: return 2;
    case Width::Wd32: return 4;
    case Width::Wd64: return 8;
    }
  L4Re::throw_error(-L4_EINVAL, "Invalid width to convert to bytes.");
}
// Bits of the REX prefix byte (low nibble).
enum Rex
{
  /// Operand size
  Rex_w = 8,
  /// ModR/M reg field
  Rex_r = 4,
  /// SIB index field
  Rex_x = 2,
  /// ModR/M r/m field
  Rex_b = 1,
};

/// Decoded view of a ModR/M byte.
struct Modrm
{
  unsigned char raw;
  explicit Modrm(unsigned char val) : raw(val) {}

  /// Register (possibly extended by Rex_b) or an addressing mode combined with
  /// the mod field.
  CXX_BITFIELD_MEMBER(0, 2, rm, raw);
  /// Register (possibly extended by Rex_r) or three additional opcode bits.
  CXX_BITFIELD_MEMBER(3, 5, reg, raw);
  /// Controls whether the rm field encodes a register (mod=3) or an addressing
  /// mode.
  CXX_BITFIELD_MEMBER(6, 7, mod, raw);
};

// Special values of the ModR/M rm field.
enum Rm
{
  Rm_sib = 4,
  Rm_ripr = 5,
};

// Values of the ModR/M mod field.
enum Mod
{
  Mod_indirect = 0,
  Mod_indirect_disp8 = 1,
  Mod_indirect_disp32 = 2,
  Mod_direct = 3,
};

/// Decoded view of an SIB (scale-index-base) byte.
struct Sib
{
  unsigned char raw;
  explicit Sib(unsigned char val) : raw(val) {}

  /// Base register (possibly extended by Rex_b).
  CXX_BITFIELD_MEMBER(0, 2, base, raw);
  /// Index register (possibly extended by Rex_x).
  CXX_BITFIELD_MEMBER(3, 5, index, raw);
  /// Scale factor of index field.
  CXX_BITFIELD_MEMBER(6, 7, scale, raw);
};
/// Accumulated state of one instruction while it is being decoded.
struct Instruction
{
  /// Instruction length, only accurate after decoding of the instruction is
  /// complete.
  unsigned char len = 0;
  /// Operand-size override
  bool op_size_ovr = false;
  /// REX prefix, if present
  unsigned char rex = 0;
  /// Operand size is forced to one byte
  bool op_size_byte = false;
  /// Register operand
  unsigned char op_reg;
  /// Shift to apply to register operand, e.g. used for accessing high byte.
  unsigned char op_reg_shift;
  /// Address operand
  l4_addr_t op_addr;
  /// Address operand is IP relative
  bool op_addr_ripr;
  /// Immediate operand
  l4_umword_t op_imm;

  // Assumption: If in protected mode or long compatibility mode we assume that
  // we are in a 32-bit code segment (CS.d == 1).
  Width op_width() const
  {
    // Operand-size override prefix and Rex.W have no effect on byte-specific
    // operations.
    if (op_size_byte)
      return Width::Wd8;

    if (rex & Rex_w)
      return Width::Wd64;

    return op_size_ovr ? Width::Wd16 : Width::Wd32;
  }

  // Assumption: If in protected mode or long compatibility mode we assume that
  // we are in a 32-bit code segment (CS.d == 1).
  Width imm_width() const
  {
    // Operand-size override prefix has no effect on byte-specific operations.
    if (op_size_byte)
      return Width::Wd8;

    // Note: REX.W does not widen immediates; they stay at most 32 bits.
    return op_size_ovr ? Width::Wd16 : Width::Wd32;
  }

  /// Apply the given REX extension bit to a 3-bit register number.
  unsigned char rex_reg(unsigned char reg, Rex rex_bit) const
  { return rex & rex_bit ? reg + 8 : reg; }
};
/**
 * Truncate value to specified width.
 *
 * \param v      Value to truncate
 * \param width  Width in bytes
 *
 * Values at least as wide as a machine word are returned unchanged.
 */
static l4_umword_t
truncate(l4_umword_t v, Width width)
{
  if (width_in_bytes(width) >= sizeof(l4_umword_t))
    return v;

  l4_umword_t const mask = (1UL << (width * 8)) - 1;
  return v & mask;
}
/**
 * Sign-extend value from specified width.
 *
 * \param v           Value to sign-extend
 * \param from_width  Width in bytes
 *
 * Values at least as wide as a machine word are returned unchanged.
 */
static l4_umword_t
sign_extend(l4_umword_t v, Width from_width)
{
  if (width_in_bytes(from_width) >= sizeof(l4_umword_t))
    return v;

  unsigned const bits = from_width * 8;
  l4_umword_t const sign_bit = 1UL << (bits - 1);
  return (v & sign_bit) ? (v | (~0UL << bits)) : v;
}
// Construct a decoder for the instruction bytes at guest IP `ip`.
// `inst_buf`/`inst_buf_len` hold the (copied) instruction bytes; `regs`
// provides the register values needed to resolve addressing modes.
Decoder::Decoder(l4_exc_regs_t const *regs, l4_addr_t ip,
                 unsigned char const *inst_buf, unsigned inst_buf_len)
: _regs(regs), _ip(ip), _inst_buf(inst_buf), _inst_buf_len(inst_buf_len),
#ifdef ARCH_amd64
  // TODO: Introduce parameter to Decoder or decode(), that signifies whether
  // CPU is in 64-bit mode or in compatibility/protected mode.
  _long_mode_64(true)
#else
  _long_mode_64(false)
#endif
{
}
// Return the full machine-word value of architectural register `regnr`
// from the saved exception state.
l4_umword_t
Decoder::regval_arch(unsigned regnr) const
{
  switch (regnr)
    {
#ifdef ARCH_x86
    case Reg_eax: return _regs->eax;
    case Reg_ebx: return _regs->ebx;
    case Reg_ecx: return _regs->ecx;
    case Reg_edx: return _regs->edx;
    case Reg_edi: return _regs->edi;
    case Reg_esi: return _regs->esi;
    case Reg_ebp: return _regs->ebp;
    case Reg_esp: return _regs->sp;
#else
    case Reg_rax: return _regs->rax;
    case Reg_rbx: return _regs->rbx;
    case Reg_rcx: return _regs->rcx;
    case Reg_rdx: return _regs->rdx;
    case Reg_rdi: return _regs->rdi;
    case Reg_rsi: return _regs->rsi;
    case Reg_rbp: return _regs->rbp;
    case Reg_rsp: return _regs->sp;
    case Reg_r8: return _regs->r8;
    case Reg_r9: return _regs->r9;
    case Reg_r10: return _regs->r10;
    case Reg_r11: return _regs->r11;
    case Reg_r12: return _regs->r12;
    case Reg_r13: return _regs->r13;
    case Reg_r14: return _regs->r14;
    case Reg_r15: return _regs->r15;
#endif
    default: return 0; // cannot happen but gcc complains
    }
}
// Register value narrowed to access width `aw`, after shifting by `shift`
// (used for the high-byte registers AH-DH).
l4_umword_t
Decoder::regval(unsigned regnr, unsigned shift, Width aw) const
{
  l4_umword_t const full = regval_arch(regnr);
  return truncate(full >> shift, aw);
}
// Human-readable name of register `regnr` at access width `aw`; `shift`
// selects the high-byte name (shift == 8) for byte accesses. Returns 0 for
// widths not representable on the current architecture.
char const *
Decoder::regname(unsigned regnr, unsigned shift, Width aw) const
{
#if defined(ARCH_x86) || defined(ARCH_amd64)
  switch (aw)
    {
    case Width::Wd8:
      return shift == 8 ? reg_names_x86_8h[regnr] : reg_names_x86_8l[regnr];
    case Width::Wd16:
      return reg_names_x86_16[regnr];
    case Width::Wd32:
      return reg_names_x86_32[regnr];
    case Width::Wd64:
#if defined(ARCH_x86)
      return 0;
#else
      return reg_names_x86_64[regnr];
#endif
    }
#endif
  return 0;
}
// Format every register selected in the bitmap `reglist` as "name[value],"
// into `buf`, then strip the trailing comma. The output is always
// NUL-terminated and never exceeds `buflen`.
void
Decoder::regname_bm_snprintf(char *buf, unsigned buflen, unsigned reglist) const
{
  if (!buflen)
    return;

  unsigned w = 0;
  for (unsigned i = 0; i < Num_registers; ++i)
    if (reglist & (1U << i))
      {
        int r = snprintf(buf + w, buflen - w, "%s[%lx],",
                         regname(i, 0, Width::Wd32), regval(i, 0, Width::Wd32));
        if (r < 0)
          break;
        if (static_cast<unsigned>(r) >= buflen - w)
          {
            // Output truncated: snprintf already NUL-terminated the buffer.
            // Fix: previously `w` could grow past `buflen`, making
            // `buflen - w` underflow and `buf + w` point out of bounds.
            w = buflen - 1;
            break;
          }
        w += r;
      }

  // Strip the trailing comma. Fix: the old code tested `reglist` instead of
  // `w` and wrote to buf[-1] when no listed register produced any output.
  if (w)
    buf[w - 1] = 0;
}
// Render operand descriptor `d` into `buf` for diagnostics and return
// `buf`. The string form depends on the descriptor type (memory address,
// register, register bitmap, or immediate).
char *
Decoder::desc_s(char *buf, unsigned buflen, Desc const &d, Width aw) const
{
  switch (d.dtype)
    {
    case Desc_mem:
      snprintf(buf, buflen, "Mem:%08lx", d.val);
      break;
    case Desc_reg:
      // NOTE(review): `aw` is printed via %d — this relies on Width being
      // an integer-convertible enum; confirm if Width ever becomes scoped.
      snprintf(buf, buflen, "Reg:%s[%08lx] (s:%d,%ld,%d)",
               regname(d.val, d.shift, aw), regval(d.val, d.shift, aw),
               d.shift, d.val, aw);
      break;
    case Desc_regbitmap:
      {
        unsigned w = snprintf(buf, buflen, "Regs:");
        regname_bm_snprintf(buf + w, buflen - w, d.val);
      }
      break;
    case Desc_imm:
      snprintf(buf, buflen, "Val:%08lx", d.val);
      break;
    }
  // Ensure NUL termination even if snprintf truncated.
  buf[buflen - 1] = 0;
  return buf;
}
// Log a one-line summary of the decoded memory access (direction, width,
// source and target operands) at the faulting IP.
void
Decoder::print_insn_info(Op const &op, Desc const &tgt, Desc const &src) const
{
  char buf_s[32], buf_t[32];
  warn()
    .printf("0x%lx (%d): %s of %u bytes from %s to %s.\n",
            _ip, op.insn_len, op.atype == Read ? "Read" : "Write",
            op.access_width,
            desc_s(buf_s, sizeof(buf_s), src, op.access_width),
            desc_s(buf_t, sizeof(buf_t), tgt, op.access_width));
}

// Assumption: If in protected mode or long compatibility mode we assume that
// we are in a 32-bit code segment (CS.d == 1).
Width
Decoder::addr_width(Instruction const &) const
{
  // TODO: Add support for address-size override prefix?
  return _long_mode_64 ? Width::Wd64 : Width::Wd32;
}
// Read `sz` bytes at the current decode position without advancing it.
// Throws -L4_ERANGE when the read would run past the fetched bytes or the
// architectural maximum instruction length.
l4_umword_t
Decoder::peek_inst_bytes(Instruction const &inst, Width sz) const
{
  unsigned new_inst_len = inst.len + width_in_bytes(sz);
  if (new_inst_len > _inst_buf_len || new_inst_len >= Max_instruction_len)
    L4Re::throw_error(-L4_ERANGE, "Instruction out of bounds.");

  unsigned char const *bytes = &_inst_buf[inst.len];
  // Fix: the previous reinterpret_cast loads violated strict aliasing and
  // assumed aligned access on an arbitrary byte buffer; memcpy expresses
  // the same unaligned load portably (and compiles to a single load).
  switch (sz)
    {
    case Width::Wd8:
      return *bytes;
    case Width::Wd16:
      {
        l4_uint16_t v;
        memcpy(&v, bytes, sizeof(v));
        return v;
      }
    case Width::Wd32:
      {
        l4_uint32_t v;
        memcpy(&v, bytes, sizeof(v));
        return v;
      }
    case Width::Wd64:
      {
        l4_uint64_t v;
        memcpy(&v, bytes, sizeof(v));
        return v;
      }
    }

  L4Re::throw_error(-L4_EINVAL, "Invalid instruction buffer access size.");
}
// Read `sz` bytes at the current decode position and advance the
// instruction length accordingly.
l4_umword_t
Decoder::read_inst_bytes(Instruction &inst, Width sz) const
{
  l4_umword_t const value = peek_inst_bytes(inst, sz);
  inst.len += width_in_bytes(sz);
  return value;
}
// Consume all legacy prefixes (groups 1-4) in front of the opcode. Only
// the operand-size override is recorded; the others are skipped (some with
// a trace note). Stops at the first non-prefix byte.
void
Decoder::decode_legacy_prefixes(Instruction &inst)
{
  for(;;)
    {
      switch (peek_inst_bytes(inst, Width::Wd8))
        {
        // Group 1
        // Lock and repeat prefixes
        case 0xf0: // lock;
          break;
        case 0xf2:
        case 0xf3:
          trace().printf("Repeat prefix not considered\n");
          break;
        // Group 2
        // Segment-Override Prefixes
        case 0x26: // ES
        case 0x36: // SS
        case 0x64: // FS
        case 0x65: // GS
          trace().printf("Segment override not considered\n");
          break;
        // Branch hints
        case 0x2e: // branch hint or CS segment override
        case 0x3e: // branch hint or DS segment override
          break;
        // Group 3
        // Operand-size override prefix
        case 0x66:
          inst.op_size_ovr = true;
          break;
        // Group 4
        // Address-size override prefix
        case 0x67:
          trace().printf("Address-size override not considered\n");
          break;
        default:
          // Not a prefix, opcode follows.
          return;
        };
      ++inst.len;
    }
}
// Consume a REX prefix if present. REX prefixes exist only in 64-bit mode;
// otherwise the byte is left for the opcode decoder.
void
Decoder::decode_rex_prefix(Instruction &inst)
{
  if (!_long_mode_64)
    return;

  unsigned char const b = peek_inst_bytes(inst, Width::Wd8);
  if ((b & 0xf0) != 0x40)
    return; // not a REX prefix

  inst.rex = b;
  ++inst.len;
}
// Decode the ModR/M byte (plus SIB and displacement if required) into the
// register and address operands of `inst`.
//
// \param opcode_ext  If non-null, the reg field carries opcode extension
//                    bits and is stored there instead of naming a register.
// \return true for memory-operand forms; false for register-direct forms
//         (mod == 3), which cannot fault and are not handled here.
bool
Decoder::decode_modrm(Instruction &inst, unsigned char *opcode_ext)
{
  Modrm modrm(read_inst_bytes(inst, Width::Wd8));

  // Writing into or reading from a register cannot raise a page fault,
  // thus not relevant for our use case.
  if (modrm.mod() == Mod_direct)
    return false;

  // Reg field encodes register if the opcode does not expect it to contain
  // additional opcode bits.
  if (!opcode_ext)
    {
      // Register operand
      inst.op_reg = inst.rex_reg(modrm.reg(), Rex_r);

      // AH to DH are only accessible if the instruction does not use a REX
      // prefix. Then instead SPL, BPL, SIL, and DIL, which is the lower
      // byte of the actually referenced register, would be accessed.
      if (!inst.rex && inst.op_size_byte && inst.op_reg > 3)
        {
          inst.op_reg -= 4;
          // Access the high byte (AH to DH)
          inst.op_reg_shift = 8;
        }
    }
  // Reg field encodes additional opcode bits.
  else
    *opcode_ext = modrm.reg();

  // Memory address operand
  if (modrm.rm() == Rm_sib)
    {
      inst.op_addr = decode_sib(inst, modrm);
    }
  else if (modrm.mod() == Mod_indirect && modrm.rm() == Rm_ripr)
    {
      // RIP-relative in 64-bit mode; plain disp32 otherwise.
      inst.op_addr_ripr = _long_mode_64;
      // Plus 32-bit displacement
      inst.op_addr = sign_extend(read_inst_bytes(inst, Width::Wd32), Width::Wd32);
    }
  else
    {
      inst.op_addr = regval(inst.rex_reg(modrm.rm(), Rex_b), 0,
                            addr_width(inst));
    }

  // Displacement
  if (modrm.mod() == Mod_indirect_disp8 || modrm.mod() == Mod_indirect_disp32)
    {
      Width sz = modrm.mod() == Mod_indirect_disp8 ? Width::Wd8 : Width::Wd32;
      inst.op_addr += sign_extend(read_inst_bytes(inst, sz), sz);
    }

  return true;
}
l4_umword_t
Decoder::decode_sib(Instruction &inst, Modrm const &modrm)
{
  // Decode the scale-index-base byte into an effective address component:
  // base + (index << scale).
  Sib sib(read_inst_bytes(inst, Width::Wd8));

  l4_umword_t base;
  bool no_base = modrm.mod() == Mod_indirect && sib.base() == 5;
  if (no_base)
    // No base register, instead a disp32 is specified.
    base = sign_extend(read_inst_bytes(inst, Width::Wd32), Width::Wd32);
  else
    base = regval(inst.rex_reg(sib.base(), Rex_b), 0, addr_width(inst));

  // Index 4 means "no index register specified".
  unsigned char idx = inst.rex_reg(sib.index(), Rex_x);
  l4_umword_t index = (idx != 4) ? regval(idx, 0, addr_width(inst)) : 0;

  return base + (index << sib.scale());
}
void
Decoder::decode_imm(Instruction &inst)
{
  // Fetch the instruction's immediate operand.
  Width width = inst.imm_width();
  inst.op_imm = read_inst_bytes(inst, width);

  // In 64-bit mode all immediates are sign-extended to 64 bits.
  bool extend = _long_mode_64 && !inst.op_size_byte && (inst.rex & Rex_w);
  if (extend)
    inst.op_imm = sign_extend(inst.op_imm, width);
}
void
Decoder::decode_imm_moffs(Instruction &inst)
{
  // The moffs forms carry an absolute memory offset right after the opcode;
  // it is stashed in the immediate field.
  // NOTE(review): this reads an operand-width offset; the x86 moffs encoding
  // is address-size wide — confirm this matches the supported guest modes.
  Width w = inst.op_width();
  inst.op_imm = read_inst_bytes(inst, w);
}
Decoder::Result
Decoder::decode(Op *op, Desc *tgt, Desc *src)
{
  // Safe wrapper around decode_unsafe(): logs failures and converts
  // exceptions raised while accessing the instruction buffer into
  // Result::Invalid.
  try
    {
      Decoder::Result result = decode_unsafe(op, tgt, src);
      if (result == Result::Success)
        return result;

      warn().printf("Unsupported or invalid instruction at 0x%lx\n", _ip);
      return result;
    }
  catch (L4::Runtime_error const &e)
    {
      warn().printf("Invalid instruction in [0x%lx, 0x%lx]: %s (%ld): %s\n",
                    _ip, _ip + _inst_buf_len, e.str(), e.err_no(),
                    e.extra_str() ? e.extra_str() : "");
      return Result::Invalid;
    }
}
/**
 * Decode worker: classify the instruction as a read or write memory access
 * and fill the operation/operand descriptors.
 *
 * May throw (via read_inst_bytes()) when the instruction runs past the
 * buffer; decode() catches that.
 */
Decoder::Result
Decoder::decode_unsafe(Op *op, Desc *tgt, Desc *src)
{
  Instruction inst{};

  // Instructions consist of the following components in the given order:
  // - Legacy prefixes (optional)
  // - REX prefix (optional)
  // - Opcode (up to three bytes)
  // - ModR/M (1 byte, if required)
  // - SIB (1 byte, if required)
  // - Displacement (1, 2 or 4 bytes, if required)
  // - Immediate (1, 2, 4 or 8 bytes, if required)
  decode_legacy_prefixes(inst);
  decode_rex_prefix(inst);

  // Read first opcode byte
  unsigned char ib = read_inst_bytes(inst, Width::Wd8);
  switch (ib)
    {
    case 0xc6: // mov $, a
    case 0xc7:
      {
        // Bit 0 of the opcode selects byte vs. full operand size.
        inst.op_size_byte = !(ib & 1);

        unsigned char opcode_ext;
        if (!decode_modrm(inst, &opcode_ext))
          return Result::Unsupported;

        // Opcode extension must be zero.
        if (opcode_ext != 0)
          return Result::Unsupported;

        decode_imm(inst);

        // Immediate-to-memory is always a write.
        op->set(Write, inst.op_width(), inst.len);
        imm_from_op_imm(src, inst);
        mem_from_op_addr(tgt, inst);
        return Result::Success;
      }

    // read
    case 0xa0: // mov a, %al
    case 0xa1: // mov a, %eax
    // write
    case 0xa2: // mov %al, a
    case 0xa3: // mov %eax, a
      {
        // moffs forms: absolute address encoded after the opcode, the
        // register is implicitly AL/AX/EAX/RAX. Bit 1 selects direction.
        inst.op_size_byte = !(ib & 1);
        bool write = (ib & 2);
        decode_imm_moffs(inst);

        op->set(write ? Write : Read, inst.op_width(), inst.len);
        (write ? src : tgt)->set_reg(Reg_eax);
        mem_from_op_imm(write ? tgt : src, inst);
        return Result::Success;
      }

    // write
    case 0x88: // mov %, a
    case 0x89: // mov %, a
    // read
    case 0x8a: // mov a, %
    case 0x8b: // mov a, %
      {
        // Register <-> memory move; direction from bit 1, size from bit 0.
        inst.op_size_byte = !(ib & 1);
        bool write = !(ib & 2);
        if (!decode_modrm(inst))
          return Result::Unsupported;

        op->set(write ? Write : Read, inst.op_width(), inst.len);
        reg_from_op_reg(write ? src : tgt, inst);
        mem_from_op_addr(write ? tgt : src, inst);
        return Result::Success;
      }

    default:
      warn().printf("Unsupported opcode: 0x%x\n", ib);
      return Result::Unsupported;
    }
}
void
Decoder::reg_from_op_reg(Desc *desc, Instruction const &inst) const
{
  // Describe the register operand decoded from ModR/M.reg.
  desc->set_reg(inst.op_reg, inst.op_reg_shift);
}
void
Decoder::imm_from_op_imm(Desc *desc, Instruction const &inst) const
{
  // Describe the decoded immediate value as an operand.
  desc->set_imm(inst.op_imm);
}
void
Decoder::mem_from_op_imm(Desc *desc, Instruction const &inst) const
{
  // moffs forms: the immediate field holds an absolute memory address.
  desc->set_mem(inst.op_imm);
}
void
Decoder::mem_from_op_addr(Desc *desc, Instruction const &inst) const
{
  // Turn the decoded effective address into a memory operand descriptor.
  l4_addr_t addr = inst.op_addr;

  // RIP-relative addressing is relative to the *next* instruction.
  if (inst.op_addr_ripr)
    addr += _ip + inst.len;

  // Truncate calculated address to current address width.
  desc->set_mem(truncate(addr, addr_width(inst)));
}
} // namespace L4mad

View File

@@ -0,0 +1,156 @@
/*
* Copyright (C) 2017, 2023-2024 Kernkonzept GmbH.
* Author(s): Adam Lackorzynski <adam@l4re.org>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/utcb.h>
#include <assert.h>
#include "debug.h"
#include "mem_access.h"
namespace L4mad
{
/// Kind of operand a Desc describes.
enum Desc_type { Desc_mem, Desc_imm, Desc_reg, Desc_regbitmap };
/// Direction of the decoded memory access.
enum Access_type { Write, Read };

/// Width in bytes.
using Width = Vmm::Mem_access::Width;

/**
 * Description of a single instruction operand.
 *
 * Depending on `dtype`, `val` holds a memory address, an immediate value,
 * a register number, or a register bitmap. For byte registers, `shift`
 * selects the byte within the register (8 for the high byte, AH..DH).
 */
struct Desc
{
  Desc_type dtype;     ///< Operand kind stored in `val`
  l4_umword_t val;     ///< Address, immediate, register number, or bitmap
  unsigned char shift; ///< Bit offset into the register

  /// Describe a memory operand at address `v`.
  void set_mem(l4_umword_t v)
  { dtype = Desc_mem; val = v; shift = 0; }

  /// Describe a register operand; `s` selects a sub-register byte.
  void set_reg(l4_umword_t v, unsigned char s = 0)
  { dtype = Desc_reg; val = v; shift = s; }

  /// Describe a set of registers as a bitmap.
  void set_regbitmap(l4_umword_t bm)
  { dtype = Desc_regbitmap; val = bm; shift = 0; }

  /// Describe an immediate operand with value `v`.
  void set_imm(l4_umword_t v)
  { dtype = Desc_imm; val = v; shift = 0; }
};
/**
 * Summary of a decoded memory-accessing operation.
 */
struct Op
{
  Access_type atype;      ///< Read or write access
  Width access_width;     ///< Access width in bytes
  unsigned char insn_len; ///< Total encoded instruction length in bytes

  /// Set all fields in one go.
  void set(Access_type t, Width aw, unsigned char il)
  {
    atype = t;
    access_width = aw;
    insn_len = il;
  }
};
// Number of general-purpose registers addressable on this architecture.
#if defined(ARCH_amd64)
enum { Num_registers = 16, };
#elif defined(ARCH_x86)
enum { Num_registers = 8, };
#endif

// Decoder-internal helpers, defined in the implementation file.
struct Modrm;
struct Instruction;
/**
 * Single-use x86 instruction decoder for memory-access classification.
 */
class Decoder
{
public:
  /**
   * Create decoder for the given execution state.
   *
   * \param regs          General-purpose registers
   * \param ip            Instruction pointer as guest virtual address
   *                      (required for RIP-relative addressing)
   * \param inst_buf      Buffer containing instruction bytes
   * \param inst_buf_len  Length of instruction byte buffer
   */
  Decoder(l4_exc_regs_t const *regs, l4_addr_t ip,
          unsigned char const *inst_buf, unsigned inst_buf_len);

  /// Architectural maximum length of an x86 instruction in bytes.
  enum { Max_instruction_len = 15 };

  /// Outcome of a decode attempt.
  enum class Result
  {
    Success,
    Unsupported,
    Invalid,
  };

  /**
   * Decode instruction as a read or write operation.
   *
   * \param[out] op   Operation
   * \param[out] tgt  Target operand description
   * \param[out] src  Source operation description
   *
   * \retval Result::Success      Instruction was decoded successfully.
   * \retval Result::Unsupported  Instruction decoding failed, because an
   *                              unsupported instruction was encountered.
   * \retval Result::Invalid      Instruction decoding failed, because an invalid
   *                              or incomplete instruction was encountered, for
   *                              example if the instruction spans more bytes
   *                              than available in the decoders instruction
   *                              buffer.
   *
   * \note The decoder assumes that the CPU is executing in long 64-bit mode or
   *       long compatibility / protected mode in a 32-bit code segment (i.e.
   *       CS.d==1). Otherwise incorrect operand and address widths are
   *       calculated.
   */
  Result decode(Op *op, Desc *tgt, Desc *src);

  /**
   * Print textual representation of a successfully decoded instruction.
   */
  void print_insn_info(Op const &op, Desc const &tgt, Desc const &src) const;

private:
  static Dbg trace() { return Dbg(Dbg::Core, Dbg::Trace, "Mad"); }
  static Dbg warn() { return Dbg(Dbg::Core, Dbg::Warn, "Mad"); }

  /// decode() worker; may throw while accessing the instruction buffer.
  Result decode_unsafe(Op *op, Desc *tgt, Desc *src);
  /// Consume legacy prefix bytes (lock/rep, segment, size overrides).
  void decode_legacy_prefixes(Instruction &inst);
  /// Consume an optional REX prefix (64-bit mode only).
  void decode_rex_prefix(Instruction &inst);
  /// Decode ModR/M (+ SIB/displacement); false for register-direct forms.
  bool decode_modrm(Instruction &inst, unsigned char *opcode_ext = nullptr);
  /// Decode a SIB byte into an effective address component.
  l4_umword_t decode_sib(Instruction &inst, Modrm const &modrm);
  /// Read the instruction's immediate operand.
  void decode_imm(Instruction &inst);
  /// Read an absolute memory offset (moffs forms) into the immediate field.
  void decode_imm_moffs(Instruction &inst);

  // Pretty-printing helpers used by print_insn_info().
  char *desc_s(char *buf, unsigned buflen, Desc const &d, Width aw) const;
  void regname_bm_snprintf(char *buf, unsigned buflen, unsigned reglist) const;
  char const *regname(unsigned regnr, unsigned shift, Width aw) const;

  // Register file access.
  l4_umword_t regval_arch(unsigned regnr) const;
  l4_umword_t regval(unsigned regnr, unsigned shift, Width aw) const;

  /// Effective address width for the current execution mode.
  Width addr_width(Instruction const &inst) const;
  /// Read bytes at the current decode position without consuming them.
  l4_umword_t peek_inst_bytes(Instruction const &inst, Width sz) const;
  /// Read bytes at the current decode position and advance inst.len.
  l4_umword_t read_inst_bytes(Instruction &inst, Width sz) const;

  // Translate decoded instruction fields into operand descriptors.
  void reg_from_op_reg(Desc *desc, Instruction const &inst) const;
  void imm_from_op_imm(Desc *desc, Instruction const &inst) const;
  void mem_from_op_imm(Desc *desc, Instruction const &inst) const;
  void mem_from_op_addr(Desc *desc, Instruction const &inst) const;

  l4_exc_regs_t const *const _regs;     ///< vCPU register state
  l4_addr_t const _ip;                  ///< Guest-virtual instruction pointer
  unsigned char const *const _inst_buf; ///< Raw instruction bytes
  unsigned const _inst_buf_len;         ///< Valid bytes in _inst_buf
  bool const _long_mode_64;             ///< CPU executes in 64-bit long mode
}; // class Decoder
} // namespace L4mad

View File

@@ -0,0 +1,218 @@
/*
* Copyright (C) 2019-2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include <cstring>
#include "vmcs.h"
#include "vcpu_ptr.h"
#include "vm_state_vmx.h"
#include "monitor/monitor.h"
#include "monitor/monitor_args.h"
namespace Monitor {
/**
 * Monitor command for inspecting CPU state.
 *
 * The primary template is an empty stub, selected when the monitor
 * support is compiled out (first template argument false).
 */
template<bool, typename T>
class Cpu_dev_cmd_handler {};
/**
 * Enabled variant of the CPU monitor command.
 *
 * Implements the interactive 'cpu <i> regs' and 'cpu <i> vmx' subcommands.
 * T is the CPU device type (CRTP); it must provide vcpu().
 */
template<typename T>
class Cpu_dev_cmd_handler<true, T> : public Cmd
{
public:
  char const *help() const override
  { return "CPU state"; }

  void usage(FILE *f) const override
  {
    fprintf(f, "%s\n"
               "* 'cpu <i> regs': dump CPU registers\n"
               "* 'cpu <i> vmx': dump VMX state\n",
            help());
  }

  void complete(FILE *f, Completion_request *compl_req) const override
  { compl_req->complete(f, {"regs", "vmx"}); }

  /// Dispatch to the selected subcommand.
  void exec(FILE *f, Arglist *args) override
  {
    if (*args == "regs")
      show_regs(f);
    else if (*args == "vmx")
      show_vmx(f);
    else
      argument_error("Invalid subcommand");
  }

  /// Dump the general-purpose registers and the instruction pointer.
  void show_regs(FILE *f) const
  {
    auto regs = get_vcpu()->r;
    auto *vms = get_vcpu().vm_state();

    fprintf(f,
            "RAX %lx\nRBX %lx\nRCX %lx\nRDX %lx\nRSI %lx\nRDI %lx\n"
            "RSP %lx\nRBP %lx\nR8 %lx\nR9 %lx\nR10 %lx\nR11 %lx\n"
            "R12 %lx\nR13 %lx\nR14 %lx\nR15 %lx\nRIP %lx\n",
            regs.ax, regs.bx, regs.cx, regs.dx, regs.si, regs.di,
            regs.sp, regs.bp, regs.r8, regs.r9, regs.r10, regs.r11,
            regs.r12, regs.r13, regs.r14, regs.r15, vms->ip());
  }

  /// Dump VMCS fields; labels: (C) control/exit info, (G) guest state.
  void show_vmx(FILE *f) const
  {
    Vmm::Vmx_state *vmx = dynamic_cast<Vmm::Vmx_state *>(get_vcpu().vm_state());
    if (!vmx)
      {
        fprintf(f, "Failed to read VMX state\n");
        return;
      }

    // Control fields
    fprintf(f, "(C) VPID: 0x%llx\n",
            vmx->vmx_read(VMCS_VPID));
    fprintf(f, "(C) Int notification vector: 0x%llx\n",
            vmx->vmx_read(VMCS_PIR_NOTIFICATION_VECTOR));
    fprintf(f, "(C) EPTP index: 0x%llx\n",
            vmx->vmx_read(VMCS_EPTP_INDEX));
    fprintf(f, "(C) EPT pointer: 0x%llx\n",
            vmx->vmx_read(VMCS_EPT_POINTER));
    fprintf(f, "(C) Pin-based execution control: 0x%llx\n",
            vmx->vmx_read(VMCS_PIN_BASED_VM_EXEC_CTLS));
    fprintf(f, "(C) Primary execution control: 0x%llx\n",
            vmx->vmx_read(VMCS_PRI_PROC_BASED_VM_EXEC_CTLS));
    fprintf(f, "(C) Secondary execution control: 0x%llx\n",
            vmx->vmx_read(VMCS_SEC_PROC_BASED_VM_EXEC_CTLS));
    // Capability registers read via the kernel interface
    fprintf(f, "(c) basic capabilities: 0x%llx\n",
            vmx->cap_read(L4_VM_VMX_BASIC_REG));
    fprintf(f, "(C) Real pin-based execution control: 0x%llx\n",
            vmx->cap_read(L4_VM_VMX_TRUE_PINBASED_CTLS_REG));
    fprintf(f, "(C) Real primary execution control: 0x%llx\n",
            vmx->cap_read(L4_VM_VMX_TRUE_PROCBASED_CTLS_REG));
    fprintf(f, "(C) Real secondary execution control: 0x%llx\n",
            vmx->cap_read(L4_VM_VMX_PROCBASED_CTLS2_REG));
    // Guest segment state
    fprintf(f, "(G) ES selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_ES_SELECTOR));
    fprintf(f, "(G) CS selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_CS_SELECTOR));
    fprintf(f, "(G) SS selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_SS_SELECTOR));
    fprintf(f, "(G) DS selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_DS_SELECTOR));
    fprintf(f, "(G) FS selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_FS_SELECTOR));
    fprintf(f, "(G) GS selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_GS_SELECTOR));
    fprintf(f, "(G) GDTR base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_GDTR_BASE));
    fprintf(f, "(G) IDTR base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_IDTR_BASE));
    fprintf(f, "(G) LDTR selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_LDTR_SELECTOR));
    fprintf(f, "(G) TR selector: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_TR_SELECTOR));
    fprintf(f, "(G) interrupt status: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_INTERRUPT_STATUS));
    // Bitmap and MSR area addresses
    fprintf(f, "(C) IO bitmap A: 0x%llx\n",
            vmx->vmx_read(VMCS_ADDRESS_IO_BITMAP_A));
    fprintf(f, "(C) IO bitmap B: 0x%llx\n",
            vmx->vmx_read(VMCS_ADDRESS_IO_BITMAP_B));
    fprintf(f, "(C) MSR bitmaps: 0x%llx\n",
            vmx->vmx_read(VMCS_ADDRESS_MSR_BITMAP));
    fprintf(f, "(C) Exit MSR store address: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_EXIT_MSR_STORE_ADDRESS));
    fprintf(f, "(C) Exit MSR load address: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_EXIT_MSR_LOAD_ADDRESS));
    fprintf(f, "(C) Entry MSR load address: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_ENTRY_MSR_LOAD_ADDRESS));
    // Entry/exit control and status fields
    fprintf(f, "(C) Entry control: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_ENTRY_CTLS));
    fprintf(f, "(C) Entry error: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_ENTRY_EXCEPTION_ERROR));
    fprintf(f, "(C) Entry MSR load cnt: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_ENTRY_MSR_LOAD_COUNT));
    fprintf(f, "(C) Entry interrupt info: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_ENTRY_INTERRUPT_INFO));
    fprintf(f, "(C) VM-instruction error: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_INSN_ERROR));
    fprintf(f, "(C) Exit control: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_EXIT_CTLS));
    fprintf(f, "(C) Exit reason: 0x%llx\n",
            vmx->vmx_read(VMCS_EXIT_REASON));
    fprintf(f, "(C) Exit interrupt info: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_EXIT_INTERRUPT_INFO));
    fprintf(f, "(C) Exit interrupt error: 0x%llx\n",
            vmx->vmx_read(VMCS_VM_EXIT_INTERRUPT_ERROR));
    fprintf(f, "(C) Guest interruptability: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_INTERRUPTIBILITY_STATE));
    // Guest segment limits and misc guest state
    fprintf(f, "(G) ES limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_ES_LIMIT));
    fprintf(f, "(G) CS limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_CS_LIMIT));
    fprintf(f, "(G) SS limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_SS_LIMIT));
    fprintf(f, "(G) DS limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_DS_LIMIT));
    fprintf(f, "(G) FS limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_FS_LIMIT));
    fprintf(f, "(G) GS limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_GS_LIMIT));
    fprintf(f, "(G) GDTR limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_GDTR_LIMIT));
    fprintf(f, "(G) IDTR limit: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_IDTR_LIMIT));
    fprintf(f, "(G) Activity state: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_ACTIVITY_STATE));
    fprintf(f, "(G) sysenter rip: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_IA32_SYSENTER_EIP));
    fprintf(f, "(G) sysenter rsp: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_IA32_SYSENTER_ESP));
    fprintf(f, "(G) exit qualification: 0x%llx\n",
            vmx->vmx_read(VMCS_EXIT_QUALIFICATION));
    fprintf(f, "(G) guest linear address: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_LINEAR_ADDRESS));
    fprintf(f, "(G) CR0: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_CR0));
    fprintf(f, "(G) CR3: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_CR3));
    fprintf(f, "(G) CR4: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_CR4));
    fprintf(f, "(G) Guest IA32 EFER: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_IA32_EFER));
    fprintf(f, "(G) RFLAGS: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_RFLAGS));
    fprintf(f, "(G) RIP: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_RIP));
    fprintf(f, "(G) RSP: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_RSP));
    fprintf(f, "(G) ES base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_ES_BASE));
    fprintf(f, "(G) CS base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_CS_BASE));
    fprintf(f, "(G) SS base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_SS_BASE));
    fprintf(f, "(G) DS base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_DS_BASE));
    fprintf(f, "(G) FS base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_FS_BASE));
    fprintf(f, "(G) GS base: 0x%llx\n",
            vmx->vmx_read(VMCS_GUEST_GS_BASE));
  }

private:
  /// CRTP accessor to the vCPU of the concrete CPU device.
  Vmm::Vcpu_ptr get_vcpu() const
  { return static_cast<T const *>(this)->vcpu(); }
};
}

View File

@@ -0,0 +1,147 @@
/*
* Copyright (C) 2024 Kernkonzept GmbH.
* Author(s): Timo Nicolai <timo.nicolai@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include <cstring>
#include <string>
#include <l4/sys/l4int.h>
#include "monitor/monitor.h"
#include "monitor/monitor_args.h"
namespace Monitor {
/**
 * Monitor command for inspecting IO APIC registers.
 *
 * The primary template is an empty stub, selected when the monitor
 * support is compiled out (first template argument false).
 */
template<bool, typename T>
class Ioapic_cmd_handler {};
/**
 * Enabled variant of the 'ioapic' monitor command.
 *
 * Prints a table of the fixed IO APIC registers followed by the
 * redirection table entries. T is the IO APIC device type (CRTP); it must
 * provide read_reg().
 */
template<typename T>
class Ioapic_cmd_handler<true, T> : public Cmd
{
  /// IO APIC register indices.
  enum Ioapic_regs
  {
    Id_reg = 0x0,
    Version_reg = 0x1,
    Arbitration_reg = 0x2,
    Redir_tbl_offset_reg = 0x10, ///< First redirection-table register
    Redir_tbl_last_reg = 0x3f,   ///< Last redirection-table register
  };

  /// Table row description for a fixed register.
  struct Ioapic_reg
  {
    char const *name;
    unsigned addr;
    unsigned bytes;
  };

public:
  Ioapic_cmd_handler()
  { register_toplevel("ioapic"); }

  char const *help() const override
  { return "IO APIC registers"; }

  void usage(FILE *f) const override
  {
    fprintf(f, "%s\n", help());
  }

  void exec(FILE *f, Arglist * /*args*/) override
  {
    show_ioapic(f);
  }

  /// Print the register table and the redirection table.
  void show_ioapic(FILE *f) const
  {
    Ioapic_reg ioapic_regs[] =
    {
      {"IOAPIC ID", Id_reg, 4},
      {"IOAPIC Version", Version_reg, 4},
      {"IOAPIC Arbitration ID", Arbitration_reg, 4},
    };

    fprintf(f, "|%-5s |%-5s |%-30s |%-18s |\n",
            "Reg", "Bytes", "Name", "Value");

    for (auto const &reg : ioapic_regs)
      print_row(f, reg);

    print_redirection_table(f);
  }

private:
  // Each redirection entry occupies two consecutive 32-bit registers, so
  // iterate in steps of two; the last entry starts at Redir_tbl_last_reg - 1.
  void print_redirection_table(FILE *f) const
  {
    for (unsigned reg = Redir_tbl_offset_reg; reg < Redir_tbl_last_reg;
         reg += 2)
      print_redir_row(f, reg, "Redirection table ",
                      (reg - Redir_tbl_offset_reg) / 2);
  }

  /// Print one 64-bit redirection entry composed of registers addr/addr+1.
  // NOTE(review): unlike print_value(), read failures (-1) are not detected
  // here and would be printed as 0xffff... — confirm whether that matters.
  void print_redir_row(FILE *f, unsigned addr, std::string name,
                       unsigned idx) const
  {
    unsigned bytes = 8;
    print_location(f, addr, bytes);

    name.append(std::to_string(idx));
    fprintf(f, "|%-30s ", name.c_str());

    l4_uint64_t lower = ioapic_read(addr);
    l4_uint64_t upper = ioapic_read(addr + 1);

    fprintf(f,
            "|0x%0*llx%.*s ",
            bytes * 2,
            (upper << 32) | (lower & 0xffff'ffffU),
            (8 - bytes) * 2,
            "        ");

    fprintf(f,"|\n");
  }

  /// Print one fixed-register row.
  void print_row(FILE *f, Ioapic_reg const &r) const
  {
    print_location(f, r.addr, r.bytes);
    fprintf(f, "|%-30s ", r.name);
    print_value(f, r.addr, r.bytes);
    fprintf(f,"|\n");
  }

  /// Print the register index and width columns.
  void print_location(FILE *f, unsigned reg, unsigned bytes) const
  { fprintf(f, "|0x%03x |%-5u ", reg, bytes); }

  /// Print the value column; -1 from the device signals a read failure.
  void print_value(FILE *f, unsigned reg, unsigned bytes) const
  {
    l4_uint64_t value = ioapic_read(reg);
    if (value == -1ULL)
      {
        fprintf(f, "Failed to read IOAPIC register\n");
        return;
      }

    fprintf(f,
            "|0x%0*llx%.*s ",
            bytes * 2,
            value,
            (8 - bytes) * 2,
            "        ");
  }

  /// CRTP accessor to the concrete IO APIC device.
  T const *ioapic() const
  { return static_cast<T const *>(this); }

  l4_uint64_t ioapic_read(unsigned reg) const
  { return ioapic()->read_reg(reg); }
};
}

View File

@@ -0,0 +1,202 @@
/*
* Copyright (C) 2019-2020, 2023-2024 Kernkonzept GmbH.
* Author(s): Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include <cstring>
#include <l4/sys/l4int.h>
#include "monitor/monitor.h"
#include "monitor/monitor_args.h"
namespace Monitor {
/**
 * Monitor command for inspecting local APIC registers.
 *
 * The primary template is an empty stub, selected when the monitor
 * support is compiled out (first template argument false).
 */
template<bool, typename T>
class Lapic_cmd_handler {};
/**
 * Enabled variant of the 'lapic' monitor command.
 *
 * Prints the x2APIC MSR-mapped register set of one or all local APICs.
 * T is the APIC array type (CRTP); it must provide get(cpu) returning a
 * LAPIC with read_msr() and is_nmi_pending().
 */
template<typename T>
class Lapic_cmd_handler<true, T> : public Cmd
{
  /// Wide registers are printed in 4-byte chunks.
  enum { Chunk_size = 4 };

  /// Table row description: register name, MSR number, width in bytes.
  struct Apic_register
  {
    Apic_register(char const *name, unsigned msr, unsigned bytes = 4)
    : name(name), msr(msr), bytes(bytes)
    {}

    char const *name;
    unsigned msr;
    unsigned bytes;
  };

public:
  Lapic_cmd_handler()
  { register_toplevel("lapic"); }

  char const *help() const override
  { return "Local APIC registers"; }

  void usage(FILE *f) const override
  {
    fprintf(f, "%s\n"
               "* 'lapic <i>': dump local APIC registers for a specific cpu\n"
               "* 'lapic all': dump local APIC registers for all cpus\n",
            help());
  }

  void exec(FILE *f, Arglist *args) override
  {
    if (*args == "all")
      {
        // Iterate CPUs until the first one without a local APIC.
        unsigned i = 0;
        while (lapic_check(i))
          {
            fprintf(f, "LAPIC %u\n", i);
            show_lapic(f, i);
            fprintf(f, "\n");
            ++i;
          }
      }
    else
      {
        unsigned lapic_no =
          args->pop<unsigned>("Failed to parse local APIC number.");

        if (!lapic_check(lapic_no))
          argument_error("No such CPU or no local APIC registers found");

        show_lapic(f, lapic_no);
      }
  }

  /// Print the register table for one local APIC.
  void show_lapic(FILE *f, unsigned lapic_no) const
  {
    // x2APIC MSR numbers; registers wider than 8 bytes (ISR/TMR/IRR) span
    // consecutive MSRs and are printed chunk-wise.
    static Apic_register registers[] = {
      { "Local APIC ID", 0x802 },
      { "Local APIC Version", 0x803 },
      { "Task Priority", 0x808 },
      { "Process Priority", 0x80a },
      { "Logical Destination", 0x80d },
      { "Destination Format", 0x80e },
      { "Spurious Vector", 0x80f },
      { "In-Service", 0x810, 32 },
      { "Trigger Mode", 0x818, 32 },
      { "Interrupt Request", 0x820, 32 },
      { "Error Status", 0x828 },
      { "Corrected Machine Check Error Interrupt", 0x82f },
      { "Interrupt Command", 0x830, 8 },
      { "LVT Timer", 0x832 },
      { "LVT Thermal Sensor", 0x833 },
      { "LVT Performance Monitoring Counters", 0x834 },
      { "LVT LINT0", 0x835 },
      { "LVT LINT1", 0x836 },
      { "LVT Error", 0x837 },
      { "Initial Count", 0x838 },
      { "Current Count", 0x839 },
      { "TSC Deadline", 0x6e0 }
    };

    fprintf(f, "|%-5s |%-5s |%-40s |%-18s |\n",
            "MSR", "Bytes", "Name", "Value");

    for (auto const &r : registers)
      {
        if (r.bytes <= 8)
          {
            print_row(f, lapic_no, r);
          }
        else
          {
            for (unsigned chunk = 0; chunk < r.bytes / Chunk_size; ++chunk)
              print_row(f, lapic_no, r, chunk);
          }
      }

    print_row(f, "Is NMI pending",
              static_cast<T const *>(this)->get(lapic_no)->is_nmi_pending());
  }

private:
  /// Pseudo-row without a backing MSR (location columns print 0).
  void print_row(FILE *f, char const *name, l4_uint64_t value) const
  {
    fprintf(f, "|0x%03x |%-5u ", 0, 0);

    fprintf(f, "|%-40s ", name);

    unsigned bytes = 4;
    fprintf(f,
            "|0x%0*llx%.*s ",
            bytes * 2,
            value,
            (8 - bytes) * 2,
            "        ");

    fprintf(f,"|\n");
  }

  /// Row for a register that fits into a single MSR read.
  void print_row(FILE *f, unsigned lapic_no, Apic_register const &r) const
  {
    print_location(f, r.msr, r.bytes);
    fprintf(f, "|%-40s ", r.name);
    print_value(f, lapic_no, r.msr, r.bytes);
    fprintf(f,"|\n");
  }

  /// Row for one chunk of a wide register spanning consecutive MSRs.
  void print_row(FILE *f,
                 unsigned lapic_no,
                 Apic_register const &r,
                 unsigned chunk) const
  {
    print_location(f, r.msr, Chunk_size);

    fprintf(f,
            "|[%3u:%3u] %-30s ",
            chunk * Chunk_size * 8,
            (chunk + 1) * Chunk_size * 8 - 1,
            r.name);

    print_value(f, lapic_no, r.msr + chunk, Chunk_size);

    fprintf(f,"|\n");
  }

  /// Print the MSR number and width columns.
  void print_location(FILE *f, unsigned msr, unsigned bytes) const
  { fprintf(f, "|0x%03x |%-5u ", msr, bytes); }

  /// Print the value column, reading the MSR from the selected LAPIC.
  void print_value(FILE *f,
                   unsigned lapic_no,
                   unsigned msr,
                   unsigned bytes) const
  {
    l4_uint64_t value;
    if (!lapic_read_msr(lapic_no, msr, &value))
      {
        fprintf(f, "Failed to read Local APIC register\n");
        return;
      }

    fprintf(f,
            "|0x%0*llx%.*s ",
            bytes * 2,
            value,
            (8 - bytes) * 2,
            "        ");
  }

  /// True if CPU `lapic_no` exists and has a local APIC.
  bool lapic_check(unsigned lapic_no) const
  { return static_cast<T const *>(this)->get(lapic_no) != nullptr; }

  bool lapic_read_msr(unsigned lapic_no, unsigned msr, l4_uint64_t *value) const
  {
    return lapic_check(lapic_no)
           && static_cast<T const *>(this)->get(lapic_no)->read_msr(msr, value);
  }
};
}

View File

@@ -0,0 +1,8 @@
/*
* Copyright (C) 2016-2017, 2019, 2024 Kernkonzept GmbH.
* Author(s): Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
/// Compile-time indication that this platform provides an I/O (port) map.
constexpr bool has_iomap()
{
  return true;
}

View File

@@ -0,0 +1,61 @@
/*
* Copyright (C) 2019-2020, 2022, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/cxx/bitfield>
namespace Vdev { namespace Msix {
/// Architecture-specific MSI-X constants (Intel).
enum Table_entry_const_arch
{
  Data_vector_mask = 0xff,          ///< Vector bits in the data register
  Address_interrupt_prefix = 0xfee, ///< Fixed upper bits of MSI addresses
};

/// MSI-X address: Interrupt request compatibility format (Intel)
struct Interrupt_request_compat
{
  l4_uint64_t raw;                                  ///< Raw 64-bit MSI address
  CXX_BITFIELD_MEMBER(40, 63, dest_id_upper, raw);  ///< Upper destination ID bits
  CXX_BITFIELD_MEMBER(32, 39, reserved0_2, raw);
  CXX_BITFIELD_MEMBER(20, 31, fixed, raw);          ///< Fixed 0xfee prefix
  CXX_BITFIELD_MEMBER(12, 19, dest_id, raw);        ///< Destination ID
  CXX_BITFIELD_MEMBER(4, 11, reserved0_1, raw);
  CXX_BITFIELD_MEMBER(3, 3, redirect_hint, raw);    ///< Redirection hint
  CXX_BITFIELD_MEMBER(2, 2, dest_mode, raw);        ///< Destination mode
  CXX_BITFIELD_MEMBER(0, 1, reserved_0, raw);

  explicit Interrupt_request_compat(l4_uint64_t addr) : raw(addr)
  {}
};
/// MSI delivery modes encoded in the data register.
enum Delivery_mode : l4_uint8_t
{
  Dm_fixed = 0,
  Dm_lowest_prio = 1,
  Dm_smi = 2,
  Dm_nmi = 4,
  Dm_init = 5,
  Dm_startup = 6,
  Dm_extint = 7,
};

/// MSI-X data format (Intel)
struct Data_register_format
{
  // Intel SDM Vol. 3A 10-35, October 2017
  l4_uint64_t raw;                                 ///< Raw data register value
  CXX_BITFIELD_MEMBER(15, 15, trigger_mode, raw);  ///< Edge/level trigger
  CXX_BITFIELD_MEMBER(14, 14, trigger_level, raw); ///< Level for level-triggered
  CXX_BITFIELD_MEMBER(8, 10, delivery_mode, raw);  ///< Delivery_mode value
  CXX_BITFIELD_MEMBER(0, 7, vector, raw);          ///< Interrupt vector

  explicit Data_register_format(l4_uint64_t data) : raw(data)
  {}
};
}} // namespace Vdev::Msix

View File

@@ -0,0 +1,51 @@
/*
* Copyright (C) 2019-2020, 2022, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "msr_device.h"
#include "vcpu_ptr.h"
namespace Vdev {
/**
* MSR device handling read access to IA32_BIOS_SIGN_ID.
*
* This MSR provides the currently loaded microcode revision in bit [32:63].
 * As MSR access is a privileged instruction this data can only be read with
* support from the kernel. By default, the kernel provides the relevant 32
* bits of IA32_BIOS_SIGN_ID in the last user_data register of the vCPU state.
*/
class Microcode_revision : public Vmm::Msr_device
{
  enum { Ia32_bios_sign_id = 0x8b };

public:
  /// Precompute the full MSR value from the kernel-provided revision.
  Microcode_revision(Vmm::Vcpu_ptr vcpu)
  : _ucode_revision(static_cast<l4_uint64_t>(vcpu.ucode_revision()) << 32)
  {
    // Fiasco reports just the upper 32-bit aka microcode revision. To recreate
    // the complete MSR, we need to shift it to the upper 32-bit of the 64-bit
    // MSR.
  }

  /// Serve reads of IA32_BIOS_SIGN_ID; all other MSRs are not handled.
  bool read_msr(unsigned msr, l4_uint64_t *value, unsigned) const override
  {
    bool handled = (msr == Ia32_bios_sign_id);
    if (handled)
      *value = _ucode_revision;
    return handled;
  }

  /// The microcode revision is read-only for the guest.
  bool write_msr(unsigned, l4_uint64_t, unsigned) override
  { return false; }

private:
  l4_uint64_t const _ucode_revision; ///< Full 64-bit MSR value (revision << 32)
}; // Microcode_revision
} // namespace Vdev

View File

@@ -0,0 +1,159 @@
/*
* Copyright (C) 2023-2024 genua GmbH, 85551 Kirchheim, Germany
* All rights reserved. Alle Rechte vorbehalten.
*/
/*
* Copyright (C) 2025 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "openbsd_bootparams.h"
#include "acpi.h"
namespace Vmm::Openbsd {
/// Print the prepared OpenBSD kernel entry parameters for debugging.
void Boot_params::dump()
{
  info().printf("OpenBSD Boot Parameters: =============================== \n");
  info().printf(" howto: 0x%x\n", _params.howto);
  info().printf(" apiversion: 0x%x\n", _params.apiversion);
  info().printf(" ac: %d\n", _params.ac);
  info().printf(" av: 0x%x\n", _params.av);
  info().printf(" bootdev: 0x%x\n", _params.bootdev);
  info().printf(" end: 0x%x\n", _params.end);
}
/**
 * Grow the BIOS memory map by one entry and fill it in.
 *
 * \param[in,out] map  Heap-allocated array of map entries, reallocated to
 *                     hold `num` entries. *map may be nullptr initially.
 * \param num          New total number of entries; the new entry is
 *                     entry num - 1. Must be > 0.
 * \param type         BIOS memory type (one of Bios_map_*).
 * \param addr         Start address of the region.
 * \param size         Size of the region in bytes.
 *
 * \throws L4::Runtime_error(-L4_ENOMEM) if growing the map fails. The
 *         previously allocated map remains valid in that case.
 */
void Boot_params::add_to_memmap(Bios_memmap **map, size_t const num,
                                l4_uint32_t type, l4_uint64_t addr,
                                l4_uint64_t size)
{
  assert(num > 0); // we expect to allocate something and not free everything

  // Keep the old pointer until realloc() succeeded: assigning a failed
  // realloc() result directly to *map would leak the original block.
  Bios_memmap *grown =
    static_cast<Bios_memmap *>(realloc(*map, num * sizeof(Bios_memmap)));
  if (grown == nullptr)
    L4Re::throw_error(-L4_ENOMEM, "Failed to setup memmap!");
  *map = grown;

  // Fill allocated map entry
  Bios_memmap &entry = (*map)[num - 1];
  entry.addr = addr;
  entry.size = size;
  entry.type = type;

  // Constant label for tracing; no need for a heap-allocated std::string.
  char const *typestr;
  switch (type)
    {
    case Bios_map_free: typestr = "Adding free"; break;
    case Bios_map_res:  typestr = "Adding reserved"; break;
    case Bios_map_acpi: typestr = "Adding ACPI"; break;
    case Bios_map_nvs:  typestr = "Adding ACPI NVS"; break;
    default:            typestr = "Adding unknown"; break;
    }
  trace().printf("%s memory to map: addr=0x%llx size=0x%llx\n", typestr,
                 addr, size);
}
/**
 * Build the BIOS memory map bootarg from the guest RAM configuration.
 *
 * Writable RAM regions become Bios_map_free entries (split at the 1 MiB
 * I/O memory hole boundary), read-only regions become Bios_map_res, the
 * ACPI FACS region is added as Bios_map_acpi, and the map is terminated
 * with a Bios_map_end entry before being appended as a bootarg.
 */
void Boot_params::setup_memmap(Vm_ram *ram)
{
  Bios_memmap *bios_memmap = nullptr;
  size_t num = 0;

  // Loop over all regions and add them to guest RAM
  ram->foreach_region([&bios_memmap, &num, this](Vmm::Ram_ds const &r) mutable {
    if (r.writable())
      {
        if (r.vm_start().get() < Iom_end
            && (r.vm_start().get() + r.size()) > Iom_end)
          {
            // Split conventional and extended memory
            add_to_memmap(&bios_memmap, ++num, Bios_map_free,
                          r.vm_start().get(), Iom_end - r.vm_start().get());
            add_to_memmap(&bios_memmap, ++num, Bios_map_free, Iom_end,
                          r.size() - Iom_end + r.vm_start().get());
          }
        else
          {
            add_to_memmap(&bios_memmap, ++num, Bios_map_free,
                          r.vm_start().get(), r.size());
          }
      }
    else
      {
        // Non-writable regions are reported as reserved.
        add_to_memmap(&bios_memmap, ++num, Bios_map_res, r.vm_start().get(),
                      r.size());
      }
  });

  // Report the ACPI FACS as reclaimable ACPI memory.
  auto facs = Acpi::Facs_storage::get()->mem_region();
  add_to_memmap(&bios_memmap, ++num, Bios_map_acpi, facs.start.get(),
                facs.end - facs.start + 1);

  // Terminating end marker expected by the OpenBSD kernel.
  add_to_memmap(&bios_memmap, ++num, Bios_map_end, 0, 0);

  if (bios_memmap != nullptr)
    {
      info().printf("Add BIOS memmap at %p.\n", bios_memmap);
      add_bootarg(Bootarg_memmap, num * sizeof(Bios_memmap), bios_memmap);
      free(bios_memmap);
    }
}
/**
 * Finalize the bootargs and write everything into guest memory.
 *
 * Places the bootargs list at guest address 9 * Phys_mem_addr and the
 * entry stack at _gp_addr, then dumps the result for debugging.
 */
void Boot_params::write(Vm_ram *ram)
{
  // Prepare BIOS ram regions
  setup_memmap(ram);

  // Add default uart console
  Bios_consdev cons;
  cons.consdev = makedev_obsd(8, 0); // com0
  cons.conspeed = 115200;
  cons.consaddr = 0x3f8;
  add_bootarg(Bootarg_consdev, sizeof(cons), &cons);

  // Finalize and write boot arguments to guest memory
  add_bootarg(Bootarg_end, 0, nullptr);
  // NOTE(review): 9 * Phys_mem_addr (= 0x9000 for 4K pages) looks like a
  // fixed convention shared with the boot protocol — confirm and name it.
  Vmm::Guest_addr bootargs_pos = Vmm::Guest_addr(Phys_mem_addr * 9);
  // NOTE(review): the memset covers exactly the bytes the following memcpy
  // overwrites, so it appears redundant — possibly a larger area was meant.
  memset(ram->guest2host<void *>(bootargs_pos), 0, _bootargs_size);
  memcpy(ram->guest2host<void *>(bootargs_pos), _bootargs, _bootargs_size);
  _params.av = bootargs_pos.get();
  _params.ac = _bootargs_size;

  // Write entry stack
  memset(ram->guest2host<void *>(_gp_addr), 0, Phys_mem_addr);
  memcpy(ram->guest2host<void *>(_gp_addr), &_params,
         sizeof(Openbsd_entry_stack));

  dump();
}
/**
 * Append one boot argument record (header + payload) to the bootargs
 * buffer.
 *
 * \param type    Bootarg type id (Bootarg_*).
 * \param length  Length of the payload in bytes.
 * \param data    Payload copied behind the header; may be nullptr if
 *                length is 0.
 *
 * \throws L4::Runtime_error(-L4_EINVAL) if the arguments exceed the
 *         one-page argument area, L4::Runtime_error(-L4_ENOMEM) if
 *         growing the buffer fails (the existing buffer stays valid).
 */
void Boot_params::add_bootarg(int type, size_t length, void const *data)
{
  // Prepare header. Zero-initialize so the unused ba_next link does not
  // leak indeterminate stack bytes into guest-visible memory.
  Boot_args next{};
  next.ba_type = type;
  next.ba_size = sizeof(next) - sizeof(next.ba_arg) + length;

  // Extend memory allocation
  size_t newsize = _bootargs_size + next.ba_size;
  if (newsize > L4_PAGESIZE)
    L4Re::throw_error(-L4_EINVAL, "OpenBSD bootargs: Too many arguments!");

  // Keep the old pointer until realloc() succeeded: assigning a failed
  // realloc() result directly would leak the existing buffer.
  void *grown = realloc(_bootargs, newsize);
  if (grown == nullptr)
    L4Re::throw_error(-L4_ENOMEM, "Failed to add bootarg!");
  _bootargs = grown;

  // The offset must be size_t: the previous l4_uint8_t parameter silently
  // truncated offsets beyond 255 bytes, corrupting the bootargs buffer.
  auto ptr_byte_add = [](void *ptr, size_t offset) {
    return static_cast<void *>(static_cast<l4_uint8_t *>(ptr) + offset);
  };

  // Paste header and content to memory
  memcpy(ptr_byte_add(_bootargs, _bootargs_size), &next,
         sizeof(next) - sizeof(next.ba_arg));
  _bootargs_size = newsize;

  if (data)
    memcpy(ptr_byte_add(_bootargs, _bootargs_size - length), data, length);
}
} // namespace Vmm::Openbsd

View File

@@ -0,0 +1,196 @@
/*
* Copyright (C) 2023-2024 genua GmbH, 85551 Kirchheim, Germany
* All rights reserved. Alle Rechte vorbehalten.
*/
/*
* Copyright (C) 2025 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/types.h>
#include "debug.h"
#include "vm_ram.h"
namespace Vmm::Openbsd {
// See OpenBSD: sys/stand/boot/bootarg.h
// Boot API version flags and well-known bootarg type ids.
enum
{
  Bapiv_ancient = 0x00000000,  /* MD old i386 bootblocks */
  Bapiv_vars = 0x00000001,     /* MD structure w/ add info passed */
  Bapiv_vector = 0x00000002,   /* MI vector of MD structures passed */
  Bapiv_env = 0x00000004,      /* MI environment vars vector */
  Bapiv_bmemmap = 0x00000008,  /* MI memory map passed is in bytes */
  Bootarg_apiver = (Bapiv_vector|Bapiv_env|Bapiv_bmemmap),
  Bootarg_end = -1,            /* Terminator record of the bootarg list */
  Bootarg_memmap = 0,          /* BIOS memory map record */
};
/**
 * Header of one boot argument record in the bootargs list.
 *
 * ba_arg marks the start of the variable-length payload; ba_size counts
 * header plus payload. The original `struct _boot_args *ba_next` link is
 * flattened to an l4_int32_t to keep the guest-visible layout fixed.
 */
struct Boot_args
{
  l4_int32_t ba_type; ///< Record type (Bootarg_*)
  l4_int32_t ba_size; ///< Total record size: header + payload
  l4_int32_t ba_next; ///< Unused link field (was: struct _boot_args *)
  char ba_arg[1];     ///< First payload byte
} __attribute__((packed));

static_assert(sizeof(Boot_args) == 13,
              "Size of packed Boot_args struct is as expected.");
// See OpenBSD: sys/arch/amd64/include/biosvar.h
// BIOS memory map entry types.
enum
{
  Bios_map_end = 0x00,  /* End of array XXX - special */
  Bios_map_free = 0x01, /* Usable memory */
  Bios_map_res = 0x02,  /* Reserved memory */
  Bios_map_acpi = 0x03, /* ACPI Reclaim memory */
  Bios_map_nvs = 0x04,  /* ACPI NVS memory */
};

/// One entry of the BIOS memory map passed via Bootarg_memmap.
struct Bios_memmap
{
  l4_uint64_t addr; /* Beginning of block */
  l4_uint64_t size; /* Size of block */
  l4_uint32_t type; /* Type of block */
} __attribute__((packed));

static_assert(sizeof(Bios_memmap) == 20,
              "Size of packed Bios_memmap struct is as expected.");
enum
{
  Bootarg_consdev = 5, ///< Bootarg type id for the console device record
};

/// Console device description passed via Bootarg_consdev.
struct Bios_consdev
{
  l4_int32_t consdev;   ///< Console dev_t (see makedev_obsd())
  l4_int32_t conspeed;  ///< Baud rate
  l4_uint64_t consaddr; ///< I/O port or MMIO address of the UART
  l4_int32_t consfreq;
  l4_uint32_t flags;
  l4_int32_t reg_width;
  l4_int32_t reg_shift;
} __attribute__((packed));

static_assert(sizeof(Bios_consdev) == 32,
              "Size of packed Bios_consdev struct is as expected.");
// See OpenBSD: sys/dev/isa/isareg.h
enum
{
Iom_end = 0x100000 /* End of I/O Memory "hole" */
};
// Encode an OpenBSD dev_t from major number `x` and minor number `y`.
// Layout: major in bits 8..15, low minor byte in bits 0..7, the upper
// minor bits shifted up into bits 16..31.
// See OpenBSD: sys/sys/types.h
static constexpr unsigned makedev_obsd(unsigned x, unsigned y)
{
  unsigned const major_part = (x & 0xffu) << 8;
  unsigned const minor_low = y & 0xffu;
  unsigned const minor_high = (y & 0xffff00u) << 8;
  return major_part | minor_low | minor_high;
}
// Memory layout for kernel entry function stack with parameters
// This assembles the memory stack for the legacy exec call in OpenBSD
// file sys/arch/amd64/stand/libsa/exec_i386.c
struct Openbsd_entry_stack
{
  l4_uint32_t returnaddr; // unused
  l4_uint32_t howto;      // int: boot flags (RB_* in OpenBSD)
  l4_uint32_t bootdev;    // dev_t: boot device (makedev encoding)
  l4_uint32_t apiversion; // api version of /boot (Bootarg_apiver)
  l4_uint32_t end;        // End address of loaded kernel binary
  l4_uint32_t extmem;     // extended memory, unused
  l4_uint32_t cnvmem;     // base memory reported by bios
  l4_uint32_t ac;         // Length of bootargs in bytes
  l4_uint32_t av;         // Offset of bootargs
} __attribute__((packed));

static_assert(sizeof(Openbsd_entry_stack) == 36,
              "Size of packed Openbsd_entry_stack struct is as expected.");
/**
 * Assembles the boot information (entry stack, BIOS memory map and chained
 * boot arguments) an OpenBSD kernel expects from the legacy /boot loader.
 */
class Boot_params
{
public:
  enum
  {
    Phys_mem_addr = L4_PAGESIZE, ///< Location of the OpenBSD boot parameters
  };

  /**
   * \param addr         Guest-physical address where the boot parameters are
   *                     placed (see Phys_mem_addr).
   * \param kernel       Load address of the kernel binary.
   * \param kernel_size  Size of the loaded kernel binary in bytes.
   */
  Boot_params(Vmm::Guest_addr addr, l4_addr_t kernel,
              l4_addr_t kernel_size)
  : _gp_addr(addr), _bootargs(nullptr), _bootargs_size(0)
  {
    info().printf("Boot_params @ 0x%lx, Kernel @ 0x%lx (size=%ld)\n",
                  addr.get(), kernel, kernel_size);

    memset(static_cast<void *>(&_params), 0, sizeof(Openbsd_entry_stack));
    _params.apiversion = Bootarg_apiver;
    // base memory: everything below the legacy I/O hole at 1 MiB
    _params.cnvmem = Iom_end;
    _params.ac = 0;
    _params.av = 0;
    _params.end = kernel + kernel_size;
  }

  /**
   * Print OpenBSD Boot Parameters on console
   */
  void dump();

  /**
   * Write boot parameters into guest memory
   */
  void write(Vm_ram *ram);

private:
  /**
   * Add memory to memory map
   */
  void add_to_memmap(Bios_memmap **map, size_t const num, l4_uint32_t type,
                     l4_uint64_t addr, l4_uint64_t size);

  /**
   * Prepare memory map for OpenBSD guest
   */
  void setup_memmap(Vm_ram *ram);

  /**
   * Get guest physical address
   */
  Vmm::Guest_addr addr() const { return _gp_addr; }

  /**
   * Add boot argument to linked list.
   *
   * \note The data is copied into an internal buffer.
   *       The caller retains ownership of p.
   */
  void add_bootarg(int t, size_t l, void const *p);

private:
  static Dbg trace() { return Dbg(Dbg::Core, Dbg::Trace, "OpenBSDBoot"); }
  static Dbg info() { return Dbg(Dbg::Core, Dbg::Info, "OpenBSDBoot"); }

  /**
   * Guest physical address of first page
   */
  Vmm::Guest_addr _gp_addr;

  /**
   * Entry stack
   */
  Openbsd_entry_stack _params;

  /**
   * Blob containing chained boot argument structs of varying sizes
   */
  void *_bootargs;

  /**
   * Size of `_bootargs` in bytes
   */
  size_t _bootargs_size;
};
} // namespace Vmm::Openbsd

View File

@@ -0,0 +1,284 @@
/*
* Copyright (C) 2017-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/re/error_helper>
#include "irq_dt.h"
#include "pit.h"
#include "acpi.h"
namespace Vdev {
/**
 * Create the PIT device with its two emulated channels.
 *
 * \param ic   Interrupt controller channel 0's IRQ is injected into.
 * \param irq  IRQ number for channel 0; if it differs from the ISA default
 *             (IRQ 0), an interrupt override is added to the MADT.
 */
Pit_timer::Pit_timer(cxx::Ref_ptr<Gic::Ic> const &ic, unsigned irq)
: _irq(ic, irq)
{
  // Index 0 is PIT channel 0, index 1 is PIT channel 2.
  // Channel 1 is not emulated.
  _channel[0] = cxx::make_unique_ptr<Channel>(new Channel(this));
  _channel[1] = cxx::make_unique_ptr<Channel>(new Channel(this, true));
  // Port 0x61 controls channel 2's gate and reflects its output.
  _port61 = make_device<Port61>(_channel[1].get());

  if (irq != Pit_isa_irq)
    {
      info().printf("Timer IRQ configured to be %u, default is %u. Adding an "
                    "override in MADT.\n", irq, Pit_isa_irq);
      Acpi::Madt_int_override_storage::get()->add_override(
        {Pit_isa_irq, irq, 0U});
    }
}
/**
 * Handle guest writes to the PIT I/O ports (0x40-0x43).
 *
 * Port 0x43 carries mode/latch/read-back commands; ports 0x40 and 0x42
 * write the reload value of channel 0 and channel 2 respectively. Only
 * byte-wide accesses are processed; other widths are silently ignored.
 *
 * \param port   Port offset relative to 0x40.
 * \param width  Access width of the guest instruction.
 * \param value  Value written by the guest.
 */
void Pit_timer::io_out(unsigned port, Vmm::Mem_access::Width width,
                       l4_uint32_t value)
{
  if (width != Vmm::Mem_access::Width::Wd8)
    return;

  std::lock_guard<std::mutex> lock(_mutex);

  switch (port)
    {
    case Mode_command: // PIC_MODE
      {
        Control_reg control_reg(value);
        unsigned channel_select = control_reg.channel();
        if (channel_select == 1)
          {
            warn().printf("set mode for channel 1 unsupported\n");
            break;
          }

        // select either channel 0 or 2; note that channel 2 is stored at
        // index 1 of the _channel array (the array has only two slots).
        channel_select = (channel_select >> 1) & 0x1;

        if (control_reg.is_read_back_cmd())
          {
            // read-back command: bits 1 and 3 select the affected channels
            if (control_reg.raw & (1U << 1)) // channel 0
              {
                if (control_reg.is_latch_status())
                  _channel[0]->latch_status();
                if (control_reg.is_latch_count())
                  _channel[0]->latch_count();
              }
            if (control_reg.raw & (1U << 3)) // channel 2 -> array index 1
              {
                // Fixed: previously accessed _channel[2], which is one past
                // the end of the two-element array (out-of-bounds access).
                if (control_reg.is_latch_status())
                  _channel[1]->latch_status();
                if (control_reg.is_latch_count())
                  _channel[1]->latch_count();
              }
            trace().printf("Read-back command: 0x%x\n", control_reg.raw);
            break;
          }

        // forward only the access/mode/BCD bits of the control word
        _channel[channel_select]->write_status(control_reg.raw & 0x3f);
        trace().printf("Mode command on channel %d: 0x%x\n", channel_select,
                       control_reg.raw);
        break;
      }
    case Channel_0_data:
    case Channel_2_data:
      {
        trace().printf("Writing 0x%x for channel %d\n", value, port);
        unsigned channel_select = port2idx(port);
        _channel[channel_select]->write_count(value & 0xff);
        break;
      }
    default:
      warn().printf("write to unimplemented channel 1\n");
      break;
    }
}
/**
 * Handle guest reads from the PIT I/O ports (0x40-0x43).
 *
 * \param      port   Port offset relative to 0x40.
 * \param      width  Access width of the guest instruction.
 * \param[out] value  Value returned to the guest.
 *
 * *value defaults to -1 from Guest::handle_io_access(), so it is left
 * untouched on any unhandled path.
 */
void Pit_timer::io_in(unsigned port, Vmm::Mem_access::Width width,
                      l4_uint32_t *value)
{
  if (width != Vmm::Mem_access::Width::Wd8)
    return;

  if (port == Mode_command)
    return; // register is write-only; ignore the read

  if (port == Channel_0_data || port == Channel_2_data)
    {
      unsigned idx = port2idx(port);
      std::lock_guard<std::mutex> lock(_mutex);
      *value = _channel[idx]->read();
      return;
    }

  warn().printf("PIT read from unimplemented channel 1\n");
}
/**
 * Write one byte of a new reload value to this channel.
 *
 * Depending on the programmed access mode the byte is placed into the low
 * byte, the high byte, or into the lo/hi two-byte sequence of the reload
 * register. Once a complete value is written the counter may be
 * (re)started (see check_start_counter()).
 *
 * \param value  Byte written by the guest to the channel's data port.
 */
void Pit_timer::Channel::write_count(l4_uint8_t value)
{
  // Writing a new count invalidates any pending latched values.
  _count_latch.reset();
  _status_latch.reset();

  if (_status.is_mode0())
    {
      // when writing a new count, out goes low.
      set_output(false);
    }

  switch(_status.access())
    {
    case Access_lobyte:
      _reload = set_low_byte(_reload, value);
      check_start_counter();
      break;
    case Access_hibyte:
      _reload = set_high_byte(_reload, value);
      check_start_counter();
      break;
    case Access_lohi:
      // two-byte sequence; the counter is only (re)started after the
      // second (high) byte arrived -- handled inside write_lo_hi().
      write_lo_hi(value);
      break;
    default:
      warn().printf("Invalid access value for write to counter: counter "
                    "%u, status 0x%x\n",
                    _is_channel2 ? 2U : 0U, _status.raw);
      return;
    }

  trace().printf("Written new counter value to channel %i: reload: 0x%x, value "
                 "0x%x\n",
                 _is_channel2 ? 2U : 0U, _reload, value);
}
/**
 * (Re)start the counter after a complete reload value has been written.
 *
 * Assumption: only called after the full write of a counter value.
 */
void Pit_timer::Channel::check_start_counter()
{
  if (!_gate)
    {
      warn().printf("count written, but gate not high: Counter %i\n",
                    _is_channel2 ? 2 : 0);
      return;
    }

  bool const reload_restarts = _status.is_mode0() || _status.is_mode4();
  if (reload_restarts)
    {
      // Modes 0 and 4 pick up a new count immediately: restart if running.
      if (_running)
        stop_counter();
      start_counter();
      return;
    }

  // The periodic modes 2 and 3 only start a stopped counter; a running
  // counter keeps its current period. Modes 1, 2, 3, 5 do not change their
  // counter value on a new reload value.
  bool const periodic = _status.is_mode2() || _status.is_mode3();
  if (!_running && periodic)
    start_counter();
}
/**
 * Process a control word directed at this channel.
 *
 * An access field of zero is a counter-latch command; everything else
 * reprograms the channel's access and operating mode.
 *
 * \param value  Control word with the channel-select bits already stripped
 *               (bits [5:0] of the value written to port 0x43).
 */
void Pit_timer::Channel::write_status(l4_uint8_t value)
{
  if ((value & 0x30U) == 0) // latch command
    {
      latch_count();
      return;
    }

  // Spec states: When writing to control word, all control logic resets.
  stop_counter();
  _count_latch.reset();
  _status_latch.reset();
  _read_lo = true;
  _write_lo = true;

  _status.write(value);
  // initial output level depends on the mode. Only mode0 is initially low.
  set_output(!_status.is_mode0());

  trace().printf("New status on channel %i: 0x%x (mode %u)\n",
                 _is_channel2 ? 2 : 0, _status.raw, _status.opmode().get());
}
/**
 * Read one byte from this channel's data port.
 *
 * Read priority: a latched status byte is returned first, then a latched
 * count, finally the live counter value. Latched values invalidate
 * themselves once fully read; lo/hi access returns low then high byte
 * across two consecutive reads.
 *
 * \return Next byte of the status, latched count or current count.
 */
l4_uint8_t Pit_timer::Channel::read()
{
  if (_status_latch.valid)
    {
      _status_latch.valid = false;
      return _status_latch.value & 0xff;
    }

  if (_count_latch.valid)
    {
      switch (_status.access())
        {
        case Access_lobyte:
          _count_latch.valid = false;
          return low_byte(_count_latch.value);
        case Access_hibyte:
          _count_latch.valid = false;
          return high_byte(_count_latch.value);
        case Access_lohi:
          if (_count_latch.read_lo == false) // reading 2nd byte invalidates
            _count_latch.valid = false;
          return read_lo_hi(&_count_latch.read_lo, _count_latch.value);
        default:
          warn().printf("Read latch with invalid access mode: counter "
                        "%u, status 0x%x\n",
                        _is_channel2 ? 2U : 0U, _status.raw);
          return 0;
        }
    }

  // read counter
  l4_uint16_t curr = current();
  switch (_status.access())
    {
    case Access_lobyte: return low_byte(curr);
    case Access_hibyte: return high_byte(curr);
    case Access_lohi: return read_lo_hi(&_read_lo, curr);
    default:
      warn().printf("Read counter with invalid access mode: counter "
                    "%u, status 0x%x\n",
                    _is_channel2 ? 2U : 0U, _status.raw);
      return 0;
    }
}
} // namespace Vdev
#include "device_factory.h"
#include "guest.h"
namespace {

/**
 * Device-tree factory for the PIT ("virt-pit" compatible nodes).
 *
 * Registers the PIT on the classic I/O ports 0x40-0x43 and its companion
 * gate/speaker control on port 0x61, and hooks the device into the VMM's
 * timer handling.
 */
struct F : Vdev::Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &node) override
  {
    Vdev::Irq_dt_iterator it(devs, node);

    if (it.next(devs) < 0)
      return nullptr;

    if (!it.ic_is_virt())
      L4Re::chksys(-L4_EINVAL, "PIT requires a virtual interrupt controller");

    auto dev = Vdev::make_device<Vdev::Pit_timer>(it.ic(), it.irq());
    auto *vmm = devs->vmm();
    // Timer channels and mode register: ports 0x40-0x43.
    auto region = Vmm::Io_region(0x40, 0x43, Vmm::Region_type::Virtual);
    vmm->add_io_device(region, dev);
    // Channel 2 gate and output readback: port 0x61.
    region = Vmm::Io_region(0x61, 0x61, Vmm::Region_type::Virtual);
    vmm->add_io_device(region, dev->port61());

    vmm->register_timer_device(dev);

    return dev;
  }
}; // struct F

static F f;
static Vdev::Device_type t = {"virt-pit", nullptr, &f};

} // namespace

View File

@@ -0,0 +1,432 @@
/*
* Copyright (C) 2017-2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <mutex>
#include <l4/cxx/bitfield>
#include <l4/cxx/unique_ptr>
#include <l4/re/env.h>
#include "device.h"
#include "io_device.h"
#include "irq.h"
#include "timer.h"
namespace Vdev {
/**
* Limited implementation of 8254 PROGRAMMABLE INTERVAL TIMER.
*
* Supports only channel 0 and 2.
* After a read-back command with status field, the following bits in the
* status field latched are not supported: OUTPUT [7], NULL COUNT [6].
*
* Modes 0-3 are supported for both counters.
* Mode 4 is only useable on counter 0, for the triggered interrupt.
* Mode 5 is not supported.
*
* Modes 4 and 5 are not supported for counter 2, because the single tick
* change in output is not emulated and its questionable, if the emulation
* would be precise enough to allow visiblity to the guest.
*
* \note This timer model uses the KIP clock as time base. You need to
* configure the Microkernel with CONFIG_SYNC_TSC in order to achieve
* sufficient granularity.
*/
class Pit_timer
: public Vmm::Io_device,
  public Vdev::Device,
  public Vdev::Timer
{
  // ISA default IRQ line of the PIT.
  enum : l4_uint8_t { Pit_isa_irq = 0, };

  enum
  {
    Channels = 2,            // only channels 0 and 2 are emulated
    Pit_tick_rate = 1193182, // given in Hertz
    Microseconds_per_second = 1000000ULL,

    // I/O port offsets relative to base port 0x40
    Channel_0_data = 0,
    Channel_2_data = 2,
    Mode_command = 3,

    Low_byte_mask = 0xff,
    High_byte_mask = 0xff00,
    High_byte_shift = 0x8,

    Latch_cmd_null_mask = 0x3f,
    Latch_cmd_channel_mask = 0xc0,

    // counter access modes (control word bits [5:4])
    Access_latch = 0,
    Access_lobyte = 1,
    Access_hibyte = 2,
    Access_lohi = 3,

    // operating modes (control word bits [3:1]); the MSB is a don't-care
    // for modes 2 and 3, hence the *2 aliases.
    Mode_terminal_count = 0,
    Mode_hw_oneshot = 1,
    Mode_rate_gen = 2,
    Mode_rate_gen2 = 6,
    Mode_square_wave = 3,
    Mode_square_wave2 = 7,
    Mode_sw_triggerd_strobe = 4,
    // mode 5 unsupported.
    Mode_periodic_mask = 0x2,
  };

  /**
   * Emulation of one PIT counter. The counter is modeled lazily: the
   * current value is computed from the KIP clock on demand and interrupt
   * generation uses the VMM timeout queue.
   */
  class Channel: public L4::Ipc_svr::Timeout_queue::Timeout
  {
    // Channel status byte as delivered by the read-back command.
    struct Status
    {
      Status() : raw(0) {}
      Status(l4_uint8_t v) : raw(v) {}

      l4_uint8_t raw = 0;
      CXX_BITFIELD_MEMBER(7, 7, output, raw);
      CXX_BITFIELD_MEMBER(6, 6, count, raw);
      CXX_BITFIELD_MEMBER(4, 5, access, raw);
      CXX_BITFIELD_MEMBER(1, 3, opmode, raw);
      CXX_BITFIELD_MEMBER(0, 0, bcd, raw);

      enum
      {
        // Bits not changed on mode command
        Retain_mask = output_bfm_t::Mask | count_bfm_t::Mask
      };

      // Apply a control word, preserving the output/null-count bits.
      void write(l4_uint8_t val)
      { raw = (val & ~Retain_mask) | (raw & Retain_mask); }

      bool is_periodic_mode() const { return opmode() > Mode_hw_oneshot; }
      bool is_one_shot_mode() const { return !is_periodic_mode(); }
      bool is_mode0() const { return opmode() == Mode_terminal_count; }
      bool is_mode1() const { return opmode() == Mode_hw_oneshot; }
      bool is_mode2() const
      { return opmode() == Mode_rate_gen || opmode() == Mode_rate_gen2; }
      bool is_mode3() const
      { return opmode() == Mode_square_wave || opmode() == Mode_square_wave2; }
      bool is_mode4() const
      { return opmode() == Mode_sw_triggerd_strobe; }
    };

    // Latched count or status value, read back via the data port.
    struct Latch
    {
      void reset()
      {
        value = 0;
        valid = false;
        read_lo = true;
      }

      l4_uint16_t value = 0;
      bool valid = false;
      bool read_lo = true; // next lo/hi read returns the low byte
    };

  public:
    Channel(Pit_timer *pit, bool is_channel2 = false)
    : _is_channel2(is_channel2), _gate(!is_channel2), _pit(pit)
    {}

    // called in the context of the timer thread, be careful with locking!
    void expired()
    {
      // Unimplemented: mode2, 4, 5: output shall be low for one tick
      // the single-tick output change in modes 2, 4 & 5 is not emulated
      if (_status.is_mode3())
        {
          // Toggle output
          set_output(!_status.output());
        }
      else
        set_output(true);

      // only channel 0 is wired to the interrupt controller
      if(!_is_channel2)
        _pit->_irq.inject();

      if (_status.is_mode2() || _status.is_mode3())
        {
          // periodic modes: restart the period from now
          _reload_kip_clock = l4_kip_clock(l4re_kip());
          if (_reload)
            _pit->requeue_timeout(this, next_timeout_us());
        }
      else
        {
          // The timer in the non periodic modes does not stop, but rolls over
          // and continues counting until gate is low or counter is set to 0.
          // Mode0 would not fire an interrupt again, since out is high until
          // reprogrammed. We don't emulate any of this and just stop.
          _running = false;
        }
    }

    // Latch the current counter value for subsequent data-port reads.
    void latch_count()
    {
      // ignore all but the first latch command
      if (_count_latch.valid)
        return;

      _count_latch.value = current();
      _count_latch.valid = true;
      _count_latch.read_lo = true;
    }

    // Latch the status byte for the next data-port read.
    void latch_status()
    {
      if (_status_latch.valid)
        return;

      _status_latch.value = _status.raw;
      _status_latch.valid = true;
    }

    void write_count(l4_uint8_t value);
    void write_status(l4_uint8_t value);
    l4_uint8_t read();

    bool gate() const { return _gate; }

    /**
     * Change the gate input of this channel (channel 2 only; channel 0's
     * gate is hard-wired high). Controlled by port 0x61 bit 0.
     */
    void gate(bool high)
    {
      // We know we are on channel 2, as only channel 2's gate can change.
      trace().printf("Channel 2: set gate to %i from %i\n", high, _gate);

      if (_status.is_mode0())
        {
          if (!high && _gate)
            stop_counter();
          else if (high && !_gate)
            start_counter();
          // XXX this reloads the counter, but it should stop counting and
          // continue after gate goes high again, unless output is high;
        }
      else if (_status.is_mode1())
        {
          if (high && !_gate) // retrigger
            {
              stop_counter();
              start_counter();
              set_output(false);
            }
        }
      else if (_status.is_mode2() || _status.is_mode3())
        {
          // the single-tick output change in modes 2, 4 & 5 is not emulated
          if (high && !_gate)
            {
              start_counter();
              set_output(true);
            }
          else if (!high && _gate)
            stop_counter();
        }
      // modes 4 & 5 not supported

      _gate = high;
    }

  private:
    static l4_uint8_t low_byte(l4_uint16_t v)
    { return v & Low_byte_mask; }

    static l4_uint8_t high_byte(l4_uint16_t v)
    { return (v >> High_byte_shift) & Low_byte_mask; }

    static l4_uint16_t set_high_byte(l4_uint16_t reg, l4_uint8_t value)
    { return (reg & Low_byte_mask) | (value << High_byte_shift); }

    static l4_uint16_t set_low_byte(l4_uint16_t reg, l4_uint8_t value)
    { return (reg & High_byte_mask) | value; }

    // Return low or high byte of `count` and toggle the sequence flag.
    static l4_uint8_t read_lo_hi(bool *read_lo, l4_uint16_t count)
    {
      l4_uint8_t ret = 0;
      if (*read_lo)
        ret = low_byte(count);
      else
        ret = high_byte(count);

      *read_lo = !*read_lo;
      return ret;
    }

    // Two-byte write sequence; the counter restart check happens only
    // after the high byte completes the value.
    void write_lo_hi(l4_uint8_t value)
    {
      if (_write_lo)
        _reload = set_low_byte(_reload, value);
      else
        {
          _reload = set_high_byte(_reload, value);
          check_start_counter();
        }
      _write_lo = !_write_lo;
    }

    // Update the output line; channel 2's output is mirrored into
    // port 0x61 bit 5.
    void set_output(bool out)
    {
      _status.output() = out;
      if (_is_channel2)
        out ? _pit->_port61->set_out() : _pit->_port61->clear_out();
    }

    void start_counter()
    {
      _reload_kip_clock = l4_kip_clock(l4re_kip());
      if (_reload)
        {
          _pit->enqueue_timeout(this, next_timeout_us());
          trace().printf("start counter for channel %i (was %s)\n",
                         _is_channel2 ? 2 : 0,
                         _running ? "running" : "not running");
          _running = true;
        }
    }

    void stop_counter()
    {
      trace().printf("stop counter for channel %i (was %s), reload: 0x%x\n",
                     _is_channel2 ? 2 : 0, _running ? "running" : "not running",
                     _reload);
      _pit->dequeue_timeout(this);
      _running = false;
    }

    void check_start_counter();

    /**
     * Next absolute timeout in microseconds.
     */
    inline l4_cpu_time_t next_timeout_us() const
    {
      assert(_reload != 0);
      l4_kernel_clock_t kip = l4_kip_clock(l4re_kip());
      l4_cpu_time_t timeout_us =
        _reload * Microseconds_per_second / Pit_tick_rate;
      // square wave with half-time toggle
      if (_status.is_mode3())
        timeout_us /= 2;
      return kip + timeout_us;
    }

    /**
     * Calculate the current value of the counter.
     *
     * The counters count down from _reload with the fixed Pit_tick_rate.
     *
     * Our Pit model does not update the tick value by itself. Instead it only
     * calculates the tick count when the guest reads the counter register. We
     * use the TSC as time basis.
     *
     * returns the current counter value of this channel
     */
    l4_uint32_t current()
    {
      // current time in microseconds
      l4_kernel_clock_t kip_us = l4_kip_clock(l4re_kip());
      // time that has gone by since _reload was set
      l4_cpu_time_t diff_us = kip_us - _reload_kip_clock;
      // return current counter value
      l4_uint32_t ticks = diff_us * Pit_tick_rate / Microseconds_per_second;
      if (_status.is_mode3())
        {
          // in mode3 the counter decrements by two on each tick, since we
          // compare to _reload, we have to double the number of counter
          // decrements. expired() is called on each half-period, where
          // _reload_kip_clock is adapted to track only the time since the last
          // reload.
          ticks *= 2;
        }

      if (ticks >= _reload)
        return 0;

      return _reload - ticks;
    }

    l4_uint16_t _reload = 0U;    // programmed reload value
    Status _status;              // current mode/access/output status
    bool _is_channel2;           // true: channel 2, false: channel 0
    bool _gate;                  ///< false = low; channel 0's gate is always high
    bool _running = false;       // a timeout is currently enqueued
    Latch _count_latch;
    Latch _status_latch;
    Pit_timer *_pit;             // owning PIT device (non-owning back pointer)
    l4_cpu_time_t _reload_kip_clock = 0ULL;  // KIP time of the last (re)load
    bool _read_lo = true;        // lo/hi read sequence position
    bool _write_lo = true;       // lo/hi write sequence position
  };

  /**
   * NMI status/control port 0x61: bit 0 gates channel 2, bit 5 reflects
   * channel 2's output.
   */
  struct Port61 : public Vmm::Io_device
  {
    Port61(Channel *ch2) : _ch2(ch2) {}

    char const *dev_name() const override
    { return "PIT port 61"; }

    void io_in(unsigned, Vmm::Mem_access::Width, l4_uint32_t *value) override
    {
      *value = val;
      val &= ~(1 << 5); // destructive read
    }

    void io_out(unsigned, Vmm::Mem_access::Width, l4_uint32_t value) override
    {
      // bit 0 drives channel 2's gate input
      _ch2->gate(value & 0x1);
      val = value & 0xff;
    }

    bool channel_2_on() const { return val & 0x1; }
    void set_out() { val |= (1 << 5); }
    void clear_out() { val &= ~(1 << 5); }

    l4_uint8_t val = 0; // current port value
    Channel *_ch2;      // channel 2 (non-owning)
  };

  // Layout of the mode/command register (port 0x43).
  struct Control_reg
  {
    Control_reg(l4_uint8_t val) : raw(val) {}

    l4_uint8_t raw;
    CXX_BITFIELD_MEMBER(6, 7, channel, raw);
    CXX_BITFIELD_MEMBER(4, 5, access, raw);
    CXX_BITFIELD_MEMBER(1, 3, opmode, raw);
    CXX_BITFIELD_MEMBER(0, 0, bcd, raw);

    bool is_read_back_cmd() const { return channel() == 3; }
    bool is_latch_status() const { return !(raw & (1U << 4)); }
    bool is_latch_count() const { return !(raw & (1U << 5)); }
  };

  // Map a data port (0x40/0x42) to the _channel index: 0 -> 0, 2 -> 1.
  static constexpr int port2idx(int port) { return port >> 1; }

  static Dbg trace() { return Dbg(Dbg::Irq, Dbg::Trace, "PIT"); }
  static Dbg info() { return Dbg(Dbg::Irq, Dbg::Info, "PIT"); }
  static Dbg warn() { return Dbg(Dbg::Irq, Dbg::Warn, "PIT"); }

public:
  Pit_timer(cxx::Ref_ptr<Gic::Ic> const &ic, unsigned irq);
  virtual ~Pit_timer() = default;

  char const *dev_name() const override
  { return "PIT"; }

  // Companion device on port 0x61 (registered separately by the factory).
  cxx::Ref_ptr<Vmm::Io_device> const port61() const { return _port61; }

  void io_out(unsigned port, Vmm::Mem_access::Width width,
              l4_uint32_t value) override;
  void io_in(unsigned port, Vmm::Mem_access::Width width,
             l4_uint32_t *value) override;

private:
  Vmm::Irq_edge_sink _irq;                      // channel 0's interrupt line
  cxx::unique_ptr<Channel> _channel[Channels];  // [0]: channel 0, [1]: channel 2
  std::mutex _mutex;                            // serializes guest port access
  cxx::Ref_ptr<Port61> _port61;
};
} // namespace Vdev

View File

@@ -0,0 +1,188 @@
/*
* Copyright (C) 2017-2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/types.h>
#include <l4/l4virtio/virtqueue>
#include <l4/cxx/ref_ptr>
#include "debug.h"
#include "ds_mmio_mapper.h"
#include "vcpu_ptr.h"
#include "vm_ram.h"
namespace Vmm {
/**
 * Walker for 4-level x86-64 guest page tables.
 *
 * Translates guest-virtual addresses into host-virtual addresses by
 * walking the guest's paging structures (PML4 -> PDPT -> PD -> PT).
 * Supports 4K pages as well as 2M and 1G large pages (PS bit set in PD
 * or PDPT entries).
 */
class Pt_walker : public cxx::Ref_obj
{
public:
  /**
   * \param mmap               Guest RAM used to resolve guest-physical table
   *                           addresses to host-virtual pointers.
   * \param max_phys_addr_bit  MAXPHYADDR of the virtual CPU; entry bits above
   *                           this are masked off when extracting physical
   *                           addresses.
   */
  Pt_walker(cxx::Ref_ptr<Vm_ram> mmap, unsigned max_phys_addr_bit)
  : _mmap(mmap),
    _levels {{Pml4_shift, Pml4_mask},
             {Pdpt_shift, Pdpt_mask},
             {Pd_shift, Pd_mask},
             {Pt_shift, Pt_mask}
    },
    _max_phys_addr_mask((1UL << max_phys_addr_bit) - 1)
  {
    trace().printf("PT_walker: MAXPHYSADDR bits %i\n", max_phys_addr_bit);

    // Physical-address masks per page size: bits [MAXPHYADDR-1 : page_shift].
    _phys_addr_mask_4k = _max_phys_addr_mask & ~((1UL << Phys_addr_4k) - 1);
    _phys_addr_mask_2m = _max_phys_addr_mask & ~((1UL << Phys_addr_2m) - 1);
    _phys_addr_mask_1g = _max_phys_addr_mask & ~((1UL << Phys_addr_1g) - 1);
  }

  /**
   * Translate a guest-virtual address into a host-virtual address.
   *
   * \param cr3        Guest CR3 value locating the PML4 table.
   * \param virt_addr  Guest-virtual address to translate.
   *
   * \return Host-virtual address backing `virt_addr`.
   *
   * \throws L4::Runtime_error(-L4_EINVAL)  A paging structure entry on the
   *                                        walk is not present.
   */
  l4_uint64_t walk(l4_uint64_t cr3, l4_uint64_t virt_addr)
  {
    // mask everything besides the 4K-aligned PML4 table address
    l4_uint64_t *tbl = translate_to_table_base(cr3 & _phys_addr_mask_4k);

    l4_uint64_t entry = _levels[0].get_entry(tbl, virt_addr);

    if (0)
      trace().printf("cr3 0x%llx, entry 0x%llx, vaddr 0x%llx\n", cr3, entry,
                     virt_addr);

    if (!(entry & Present_bit))
      // Fixed diagnostic: the old message claimed the table *is* present.
      L4Re::chksys(-L4_EINVAL, "PML4 entry not present\n");

    for (unsigned i = 1; i < Pt_levels; ++i)
      {
        // PML4Entry: no PAT bit (12) --> mask everything except [M-1:12]
        tbl = translate_to_table_base(entry & _phys_addr_mask_4k);

        entry = _levels[i].get_entry(tbl, virt_addr);

        if (!(entry & Present_bit))
          {
            char buf[78];
            snprintf(buf, sizeof(buf),
                     "Page-table entry 0x%llx at level %u not present.\n",
                     entry, i);
            L4Re::chksys(-L4_EINVAL, buf);
          }

        // check for PS = 0 in PDPT & PD entries
        if (i < 3 && entry & Pagesize_bit)
          {
            if (i == 1) // 1G page mapped by the PDPT entry
              return add_voffset(translate_to_table_base(entry & _phys_addr_mask_1g),
                                 virt_addr & G1_offset_mask);
            if (i == 2) // 2M page mapped by the PD entry
              return add_voffset(translate_to_table_base(entry & _phys_addr_mask_2m),
                                 virt_addr & M2_offset_mask);
          }
      }

    // final PT entry maps a 4K page
    return add_voffset(translate_to_table_base(entry & _phys_addr_mask_4k),
                       virt_addr & K4_offset_mask);
  }

private:
  /// Resolve a guest-physical table address to a host-virtual pointer.
  l4_uint64_t *translate_to_table_base(l4_uint64_t addr)
  {
    auto *ret = _mmap->guest2host<l4_uint64_t *>(Guest_addr(addr));
    if (0)
      trace().printf("Ram_addr: addr 0x%llx --> %p\n", addr, ret);
    return ret;
  }

  /// Add the page-offset part of the virtual address to the page base.
  l4_uint64_t add_voffset(l4_uint64_t *addr, l4_uint64_t offset)
  {
    return reinterpret_cast<l4_uint64_t>(addr) + offset;
  }

  /// Debugging aid: dump all present entries of one table.
  void dump_level(l4_uint64_t *tbl)
  {
    trace().printf("Dumping page table %p\n", tbl);
    for (int i = 0; i < 512; ++i)
      if (tbl[i] != 0 && tbl[i] & Present_bit)
        trace().printf("%i :: 0x%16llx\n", i, tbl[i]);
  }

  /// Debugging aid: dump all present entries of a two-level hierarchy.
  void dump_all_valid_entries(l4_uint64_t base_ptr)
  {
    trace().printf(" +++++ Dumping all entries ++++ \n");
    l4_uint64_t *tbl = reinterpret_cast<l4_uint64_t *>(base_ptr);
    for (int i = 0; i < 512; ++i)
      {
        if (tbl[i] != 0 && tbl[i] & Present_bit)
          {
            trace().printf("%i :: 0x%16llx\n", i, tbl[i]);
            dump_level(translate_to_table_base(tbl[i] & _phys_addr_mask_4k));
          }
      }
    trace().printf(" +++++ Dumped all entries ++++ \n");
  }

  /// Shift/mask pair selecting the table index of one paging level.
  struct Level
  {
    Level(int s, l4_uint64_t m) : shift(s), mask(m) {}

    /// Fetch the entry of `tbl` indexed by `vaddr` at this level.
    l4_uint64_t get_entry(l4_uint64_t *tbl, l4_uint64_t vaddr) const
    {
      if (0)
        trace().printf("next level idx: %llu\n", (vaddr & mask) >> shift);
      return tbl[(vaddr & mask) >> shift];
    }

    int const shift;
    l4_uint64_t const mask;
  };

  static Dbg trace() { return Dbg(Dbg::Mmio, Dbg::Trace, "PTW"); }

  enum
  {
    Table_index_size = 9,
    Table_index_mask = (1UL << Table_index_size) - 1,
    K4_offset_size = 12,
    K4_offset_mask = (1UL << K4_offset_size) - 1,
    M2_offset_size = 21,
    M2_offset_mask = (1UL << M2_offset_size) - 1,
    G1_offset_size = 30,
    G1_offset_mask = (1UL << G1_offset_size) - 1,

    Pt_shift = 12,
    Pt_mask = Table_index_mask << Pt_shift,
    Pd_shift = 21,
    Pd_mask = Table_index_mask << Pd_shift,
    Pdpt_shift = 30,
    Pdpt_mask = Table_index_mask << Pdpt_shift,
    Pml4_shift = 39,
    Pml4_mask = Table_index_mask << Pml4_shift,

    Present_bit = 1UL,
    RW_bit = 2UL,
    US_bit = 4UL,
    Pagesize_bit = 1UL << 7,

    Phys_addr_4k = 12,
    Phys_addr_2m = 21,
    Phys_addr_1g = 30,
    XD_bit_shift = 63,
    XD_bit = 1UL << XD_bit_shift,

    Pt_levels = 4,
  };

  cxx::Ref_ptr<Vm_ram> _mmap;
  Level const _levels[Pt_levels];
  l4_uint64_t _phys_addr_mask_4k;
  l4_uint64_t _phys_addr_mask_2m;
  l4_uint64_t _phys_addr_mask_1g;
  l4_uint64_t _max_phys_addr_mask;
};
} // namespace Vmm

View File

@@ -0,0 +1,217 @@
/*
* Copyright (C) 2020-2024 Kernkonzept GmbH.
* Author(s): Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
* Jan Klötzke <jan.kloetzke@kernkonzept.com>
* Christian Pötzsch <christian.poetzsch@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "acpi.h"
#include "device/qemu_fw_cfg.h"
namespace {
using namespace Acpi;
/**
* Provide tables via the Qemu_fw_cfg to the guest firmware.
*
* The details of the interface are documented in the Qemu sources in
* hw/acpi/bios-linker-loader.c. It is actively used by firmwares such as
* Tianocore, so it can be considered stable.
*
* Because the final address of the tables is not known here, a more flexible
* interface is used. The guest firmware is instructed by the
* "etc/table-loader" commands file how to install the tables correctly. It
* holds the commands to allocate space for the tables, patch the pointers
* between the different tables and how to compute the checksums.
*/
/**
 * Generator for the ACPI table files and the accompanying
 * "etc/table-loader" command script served via fw_cfg.
 */
class Acpi_tables : public Tables
{
  enum
  {
    // Pre-allocation sizes for the table blob and the loader script.
    Tables_reservation = 8192,
    Loader_commands_reservation = 512,
  };

  enum
  {
    // Commands
    Qemu_loader_allocate = 1,
    Qemu_loader_add_pointer = 2,
    Qemu_loader_add_checksum = 3,

    // Allocation zones
    Qemu_loader_zone_high = 1,
    Qemu_loader_zone_fseg = 2,

    Qemu_loader_file_name_size = Qemu_fw_cfg::File_name_size,
  };

  // One fixed-size (128 byte) entry of the "etc/table-loader" command file.
  struct Qemu_loader_entry
  {
    l4_uint32_t type;
    union
    {
      struct Allocate
      {
        char file_name[Qemu_loader_file_name_size];
        l4_uint32_t alignment;
        l4_uint8_t zone;
      } allocate;
      struct Add_pointer
      {
        char dst_file_name[Qemu_loader_file_name_size];
        char src_file_name[Qemu_loader_file_name_size];
        l4_uint32_t dst_pointer_offset;
        l4_uint8_t dst_pointer_size;
      } add_pointer;
      struct Add_checksum
      {
        char file_name[Qemu_loader_file_name_size];
        l4_uint32_t checksum_offset;
        l4_uint32_t start;
        l4_uint32_t size;
      } add_checksum;
      l4_uint8_t pad[124];
    } cmd;
  };
  static_assert(sizeof(Qemu_loader_entry) == 128,
                "Invalid size of Qemu_loader_entry");

public:
  static char const constexpr *Rsdp_file_name = "etc/acpi/rsdp";
  static char const constexpr *Tables_file_name = "etc/acpi/tables";
  static char const constexpr *Loader_commands_file_name = "etc/table-loader";
  static char const constexpr *System_states_file_name = "etc/system-states";

  /// Generate RSDP, the table blob, loader commands and the S3-state file.
  Acpi_tables(Vdev::Device_lookup *devs)
  : _system_states_file(6)
  {
    info.printf("Initialize Qemu IF ACPI tables.\n");

    _tables.resize(Tables_reservation);
    _loader_cmds.reserve(Loader_commands_reservation);

    cmd_add_alloc(Tables_file_name, 64 /* FACS requirement */, false);
    Writer table_wr(reinterpret_cast<l4_addr_t>(_tables.data()), _tables.size());
    write_all_tables(table_wr, devs);
    // shrink to the actually written size
    _tables.resize(table_wr.pos());
    resolve_table_refs_and_checksums(Tables_file_name, table_wr, table_wr);

    cmd_add_alloc(Rsdp_file_name, 16, true /* EBDA area */);
    _rsdp.resize(Rsdp_size);
    Writer rdsp_wr(reinterpret_cast<l4_addr_t>(_rsdp.data()), _rsdp.size());
    write_rsdp(rdsp_wr);
    resolve_table_refs_and_checksums(Rsdp_file_name, rdsp_wr, table_wr);

    // This is a qemu <-> EFI Interface. It is "documented" in
    // edk2/Ovmf/Library/QemuFwCfgS3Lib/QemuFwCfgS3PeiDxe.c
    // QemuFwCfgS3Enabled()
    // We only implement the bit needed for EFI to signal S3 support.
    _system_states_file[3] = (1 << 7); // S3 supported
  }

  std::vector<char> const &rsdp() const
  { return _rsdp; };

  std::vector<char> const &tables() const
  { return _tables; }

  std::string const & loader_cmds() const
  { return _loader_cmds; }

  std::vector<char> const &system_states_file() const
  { return _system_states_file; }

private:
  /**
   * Patch all table references recorded by `wr` and emit the matching
   * add_pointer/add_checksum loader commands for file `fn`.
   */
  void resolve_table_refs_and_checksums(char const *fn, Writer &wr,
                                        Writer &table_wr)
  {
    for (Writer::Table_ref const &ref : wr.table_refs())
      {
        if (ref.size == 4)
          *wr.as_ptr<l4_uint32_t>(ref.offset) = table_wr.table_offset(ref.table);
        else if (ref.size == 8) // XSDT
          *wr.as_ptr<l4_uint64_t>(ref.offset) = table_wr.table_offset(ref.table);
        else
          L4Re::throw_error(-L4_EINVAL, "Unsupported table offset size.");

        cmd_add_pointer(fn, ref.offset, ref.size, Tables_file_name);
      }

    for (Writer::Checksum const &checksum : wr.checksums())
      cmd_add_checksum(fn, checksum.offset, checksum.len, checksum.field_off);
  }

  /// Append a fully assembled loader command to the command file.
  void append_loader_cmd(Qemu_loader_entry const &e)
  { _loader_cmds.append(reinterpret_cast<char const *>(&e), sizeof(e)); }

  /**
   * Instruct the firmware to compute the checksum of `size` bytes starting
   * at `start` in file `fn` and store it at offset `checksum`.
   */
  void cmd_add_checksum(char const *fn, l4_size_t start, l4_size_t size,
                        l4_size_t checksum)
  {
    Qemu_loader_entry e;
    std::memset(&e, 0, sizeof(e));
    e.type = Qemu_loader_add_checksum;
    std::strncpy(e.cmd.add_checksum.file_name, fn,
                 sizeof(e.cmd.add_checksum.file_name) - 1U);
    e.cmd.add_checksum.checksum_offset = checksum;
    e.cmd.add_checksum.start = start;
    e.cmd.add_checksum.size = size;
    append_loader_cmd(e);
  }

  /**
   * Add the pointer value to `src_fn` in the file `dst_fn` at offset
   * `dst_off`. The patched pointer size is `dst_size`.
   */
  void cmd_add_pointer(char const *dst_fn, l4_size_t dst_off, l4_size_t dst_size,
                       char const *src_fn)
  {
    Qemu_loader_entry e;
    std::memset(&e, 0, sizeof(e));
    e.type = Qemu_loader_add_pointer;
    std::strncpy(e.cmd.add_pointer.dst_file_name, dst_fn,
                 sizeof(e.cmd.add_pointer.dst_file_name) - 1U);
    std::strncpy(e.cmd.add_pointer.src_file_name, src_fn,
                 sizeof(e.cmd.add_pointer.src_file_name) - 1U);
    e.cmd.add_pointer.dst_pointer_offset = dst_off;
    e.cmd.add_pointer.dst_pointer_size = dst_size;
    append_loader_cmd(e);
  }

  /// Instruct the firmware to allocate space for file `fn`.
  void cmd_add_alloc(char const *fn, l4_size_t align, bool fseg_zone)
  {
    Qemu_loader_entry e;
    std::memset(&e, 0, sizeof(e));
    e.type = Qemu_loader_allocate;
    std::strncpy(e.cmd.allocate.file_name, fn,
                 sizeof(e.cmd.allocate.file_name) - 1U);
    e.cmd.allocate.alignment = align;
    e.cmd.allocate.zone = fseg_zone ? Qemu_loader_zone_fseg
                                    : Qemu_loader_zone_high;
    append_loader_cmd(e);
  }

  std::vector<char> _rsdp;
  std::vector<char> _tables;
  std::vector<char> _system_states_file;
  std::string _loader_cmds;
};
/**
 * fw_cfg provider publishing the generated ACPI table files to the guest
 * firmware.
 */
struct Qemu_fw_cfg_tables : public Qemu_fw_cfg::Provider
{
  void init_late(Vdev::Device_lookup *devs) override
  {
    // Generate the tables now: all devices exist at this point, so their
    // ACPI contributions are complete.
    Acpi_tables tables(devs);
    Qemu_fw_cfg::put_file(Acpi_tables::Rsdp_file_name, tables.rsdp());
    Qemu_fw_cfg::put_file(Acpi_tables::Tables_file_name, tables.tables());
    Qemu_fw_cfg::put_file(Acpi_tables::Loader_commands_file_name, tables.loader_cmds());
    Qemu_fw_cfg::put_file(Acpi_tables::System_states_file_name,
                          tables.system_states_file());
  }
};
static Qemu_fw_cfg_tables f;
}; // namespace

View File

@@ -0,0 +1,167 @@
/*
* Copyright (C) 2020-2024 Kernkonzept GmbH.
* Author(s): Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
* Jan Klötzke <jan.kloetzke@kernkonzept.com>
* Christian Pötzsch <christian.poetzsch@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "cpu_dev_array.h"
#include "guest.h"
#include "device/qemu_fw_cfg.h"
#include <l4/cxx/unique_ptr>
#include <l4/re/util/env_ns>
namespace {
/**
* Device to forward boot data over the qemu fw configuration interface.
*
* The qemu_fw_cfg node must have l4vmm,kernel, l4vmm,ramdisk and l4vmm,cmdline
* as additional properties. Their value can be an empty string.
*
* \code{.dtb}
* qemu_fw_if {
* compatible = "l4vmm,qemu-fw-cfg";
* reg = <0x1 0x510 0x0c>;
* l4vmm,kernel = "linux";
* l4vmm,ramdisk = "ramdisk";
* l4vmm,cmdline = "console=TTY0";
* };
* \endcode
*/
class Qemu_fw_cfg_boot : public Qemu_fw_cfg::Provider
{
// fw_cfg item selector keys used by this provider.
enum Fw_cfg_item_selectors
{
  // Item selectors defined by Qemu
  Fw_cfg_cpu_count = 0x05,
  Fw_cfg_kernel_size = 0x08,
  Fw_cfg_initrd_size = 0x0b,
  Fw_cfg_boot_menu = 0x0e,
  Fw_cfg_kernel_data = 0x11,
  Fw_cfg_commandline_size = 0x14,
  Fw_cfg_commandline_data = 0x15,
  Fw_cfg_kernel_setup_size = 0x17,
  Fw_cfg_kernel_setup_data = 0x18,
  Fw_cfg_initrd_data = 0x12,
  // Added by KK
  Fw_cfg_uvmm_dt = 0xe0,  // address of the guest device tree
};
/**
 * Read the kernel/ramdisk/cmdline names and extra fw_cfg items from the
 * device-tree node.
 *
 * \throws L4::Runtime_error(-L4_EINVAL)  An "l4vmm,items" entry is
 *                                        malformed.
 */
void init(Vdev::Device_lookup * /*devs*/, Vdev::Dt_node const &node) override
{
  // get_prop() returns nullptr for an absent property; assigning a null
  // char pointer to std::string is undefined behavior, so guard each
  // assignment instead of assigning the raw result directly.
  if (char const *kernel = node.get_prop<char>("l4vmm,kernel", nullptr))
    _kernel = kernel;
  if (char const *ramdisk = node.get_prop<char>("l4vmm,ramdisk", nullptr))
    _ramdisk = ramdisk;
  if (char const *cmdline = node.get_prop<char>("l4vmm,cmdline", nullptr))
    _cmdline = cmdline;

  auto c = node.stringlist_count("l4vmm,items");
  if (c > 0)
    for (int i = 0; i < c; i++)
      {
        std::string arg(node.stringlist_get("l4vmm,items", i, NULL));
        // Find the comma delimiter between "[name=]name" and "string=string".
        // The name component should not be empty.
        auto pos = arg.find(',');
        if (pos == std::string::npos || pos == 0)
          L4Re::throw_error(-L4_EINVAL, "fw_cfg items needs name");
        // Strip the optional "name=" label from the name component.
        auto name = arg.substr(0, pos);
        if (name.substr(0, 5) == std::string("name="))
          name = name.substr(5);
        // Strip the required "string=" label from the string component.
        auto string = arg.substr(pos);
        if (string.substr(0, 8) != std::string(",string="))
          L4Re::throw_error(-L4_EINVAL, "fw_cfg items only support strings");
        string = string.substr(8);
        _items.push_back(std::make_tuple(name, string));
      }
};
void init_late(Vdev::Device_lookup *devs) override
{
if (!_kernel.empty())
{
_kernel_binary = cxx::make_unique<Boot::Binary_ds>(_kernel.c_str());
if (!_kernel_binary->is_valid())
L4Re::throw_error(-L4_EINVAL, "Kernel dataspace not found.");
if (_kernel_binary->is_elf_binary())
L4Re::throw_error(-L4_EINVAL, "Elf files not supported for qemu fw.");
l4_uint8_t num_setup_sects =
*((char *)_kernel_binary->get_data() + Vmm::Bp_setup_sects);
add_kernel(_kernel_binary->ds(), (num_setup_sects + 1) * 512);
}
if (!_ramdisk.empty())
{
_ramdisk_ds = L4Re::Util::Unique_cap<L4Re::Dataspace>(
L4Re::chkcap(L4Re::Util::Env_ns().query<L4Re::Dataspace>(
_ramdisk.c_str()),
"Ramdisk dataspace not found"));
add_initrd(_ramdisk_ds.get());
}
if (!_cmdline.empty())
add_cmdline(_cmdline.c_str());
for (auto const &s: _items)
Qemu_fw_cfg::put_file(std::get<0>(s).c_str(), std::get<1>(s));
add_dt_addr(devs->vmm()->dt_addr());
add_cpu_count(devs->cpus()->max_cpuid() + 1);
};
void add_cmdline(char const *cmdline)
{
size_t len = strlen(cmdline) + 1U;
Qemu_fw_cfg::set_item_u32le(Fw_cfg_commandline_size, len);
Qemu_fw_cfg::set_item(Fw_cfg_commandline_data, cmdline, len);
}
void add_kernel(L4::Cap<L4Re::Dataspace> kernel, l4_size_t setup_size)
{
size_t image_size = kernel->size();
Qemu_fw_cfg::set_item_u32le(Fw_cfg_kernel_setup_size, setup_size);
Qemu_fw_cfg::set_item(Fw_cfg_kernel_setup_data, kernel, 0, setup_size);
Qemu_fw_cfg::set_item_u32le(Fw_cfg_kernel_size, image_size - setup_size);
Qemu_fw_cfg::set_item(Fw_cfg_kernel_data, kernel, setup_size);
}
void add_initrd(L4::Cap<L4Re::Dataspace> initrd)
{
Qemu_fw_cfg::set_item_u32le(Fw_cfg_initrd_size, initrd->size());
Qemu_fw_cfg::set_item(Fw_cfg_initrd_data, initrd);
}
void add_dt_addr(l4_addr_t addr)
{
l4_uint64_t addr_le = htole64(addr);
Qemu_fw_cfg::set_item(Fw_cfg_uvmm_dt, &addr_le, sizeof(addr_le));
}
void add_cpu_count(l4_uint16_t num)
{
Qemu_fw_cfg::set_item_u16le(Fw_cfg_cpu_count, num);
}
std::string _kernel;
cxx::unique_ptr<Boot::Binary_ds> _kernel_binary;
std::string _ramdisk;
L4Re::Util::Unique_cap<L4Re::Dataspace> _ramdisk_ds;
std::string _cmdline;
std::vector<std::tuple<std::string, std::string>> _items;
};
static Qemu_fw_cfg_boot f;
}; // namespace

View File

@@ -0,0 +1,599 @@
/*
* Copyright (C) 2017-2019, 2021-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
/**
* Minimal viable implementation of a CMOS RTC (Motorola MC146818A).
*
* We do not support setting new time values.
* We only support 24h mode (it is hard-wired).
* We do not support the century byte.
*
* On amd64 linux will assume the rtc is in BCD mode even when the format is
* set to binary.
*
* Example device tree entry:
*
* \code{.dtb}
* rtc {
* compatible = "virt-rtc";
* reg = <0x0 0x0 0x0 0x0>;
* interrupt-parent = <&IOAPIC>;
* interrupts = <8>;
* };
* \endcode
*
* Optionally this emulation can use wallclock-time from an external source.
*/
#include "device_factory.h"
#include "guest.h"
#include "device.h"
#include "io_device.h"
#include "timer.h"
#include "irq_dt.h"
#include "../device/rtc-hub.h"
#include <time.h>
#include <errno.h>
#include <l4/bid_config.h>
namespace Vdev {
/// Minimal CMOS RTC (Motorola MC146818A) emulation.
///
/// 24h mode is hard-wired; the guest may set a new time via the reg B
/// set-bit protocol. Alarm and update-ended interrupts are supported,
/// periodic interrupts are not.
class Rtc :
  public Vdev::Timer,
  public Vdev::Pm_device,
  public Vmm::Io_device,
  public Vdev::Device
{
  // Register index selected by a write to I/O port 0x70 (masked to 7 bits).
  enum Register : unsigned
  {
    Seconds = 0,
    Seconds_alarm,
    Minutes,
    Minutes_alarm,
    Hours,
    Hours_alarm,
    Weekday,
    Day_of_month,
    Month,
    Year,
    Status_a = 0xa,
    Status_b = 0xb,
    Reg_c = 0xc,
    Reg_d = 0xd,
    // Cmos_ram
    Ram_start = 0xe,
    Ram_end = 0x80,
    Ram_size = Ram_end - Ram_start,
  };

  // Status register C flag bits (read-only, cleared on read).
  enum Status_reg_c : l4_uint8_t
  {
    Interrupt_request = 0x80,
    Periodic_interrupt_flag = 0x40,
    Alarm_interrupt_flag = 0x20,
    Update_ended_interrupt_flag = 0x10,
  };

  // Status register D: only the valid-RAM-and-time bit is modeled.
  enum Status_reg_d : l4_uint8_t
  {
    Valid_ram_and_time = 0x80,
  };

  struct Status_reg_a
  {
    l4_uint8_t reg = 0;
    CXX_BITFIELD_MEMBER(0, 3, rate_selection_bits, reg);
    CXX_BITFIELD_MEMBER(4, 6, divider_selection_bits, reg);
    CXX_BITFIELD_MEMBER(7, 7, update_in_progress, reg);
  };

  struct Status_reg_b
  {
    l4_uint8_t reg = 0x2; // mode_24 == 1
    CXX_BITFIELD_MEMBER(0, 0, daylight_savings_enable, reg);
    CXX_BITFIELD_MEMBER(1, 1, mode_24, reg);
    CXX_BITFIELD_MEMBER(2, 2, data_mode, reg);
    CXX_BITFIELD_MEMBER(3, 3, square_wave_enable, reg);
    CXX_BITFIELD_MEMBER(4, 4, update_ended_interrupt_enable, reg);
    CXX_BITFIELD_MEMBER(5, 5, alarm_interrupt_enable, reg);
    CXX_BITFIELD_MEMBER(6, 6, periodic_interrupt_enable, reg);
    CXX_BITFIELD_MEMBER(7, 7, set, reg);
  };

  /// Timeout-queue entry that fires the alarm interrupt.
  struct Alarm : public L4::Ipc_svr::Timeout_queue::Timeout
  {
    Rtc *_rtc;
    Alarm(Rtc *rtc) : _rtc(rtc) {}

    /**
     * Handle expired alarms.
     *
     * This function is called from the timer thread.
     */
    void expired() override
    {
      // NOTE(review): _reg_b is read without holding _mutex here; a racing
      // guest write to reg B may be observed torn-free but stale.
      if (!_rtc->_reg_b.alarm_interrupt_enable())
        {
          trace().printf("Alarm interrupt but alarm interrupt enable not set.\n");
          return;
        }
      {
        std::lock_guard<std::mutex> lock(_rtc->_mutex);
        _rtc->_reg_c |= Alarm_interrupt_flag;
        _rtc->_reg_c |= Interrupt_request;
      }
      trace().printf("RTC Irq due to alarm expired()\n");
      _rtc->_sink.inject();
    }
  }; // struct Alarm
  // allow Alarm access to private Rtc members.
  friend struct Alarm;

  /// Convert internal binary representation to BCD unless binary data mode
  /// is selected in reg B.
  l4_uint32_t convert_to_guest(int val)
  {
    if (_reg_b.data_mode())
      return val;
    // See https://de.wikipedia.org/wiki/BCD-Code
    return (val % 10) + ((val / 10) << 4);
  }

  /// Convert a guest-provided value (BCD or binary, per reg B) to binary.
  l4_uint8_t convert_from_guest(l4_uint8_t val)
  {
    if (_reg_b.data_mode()) // we are using binary mode
      return val;
    return (val & 0xf) + ((val & 0xf0) >> 4) * 10;
  }

  /**
   * Commit a new wallclock time on the falling edge of the reg B set bit.
   *
   * \param r  New reg B value being written by the guest.
   */
  void handle_set_time(Status_reg_b r)
  {
    // As long as the set() bit is set, the guest assumes that the clock does
    // not update. We redirect all writes to shadow registers, and those
    // never get updated.
    // The strategy for updating is:
    // - the guest sets the set bit to 1
    // - the guest writes the new time value to the shadow registers
    // - the guest sets the set bit to 0
    // - once the set bit is 0, Uvmm retrieves the new time value from the
    //   shadow registers and updates its internal time.
    bool old_set_bit = _reg_b.set().get();
    bool new_set_bit = r.set().get();
    if (!old_set_bit || new_set_bit)
      return;

    time_t seconds = ns_to_s(L4rtc_hub::ns_since_epoch());
    struct tm *t = gmtime(&seconds);
    if (!t)
      {
        warn().printf("Could not determine time.\n");
        return;
      }
    t->tm_sec = _shadow_registers[Seconds];
    t->tm_min = _shadow_registers[Minutes];
    t->tm_hour = _shadow_registers[Hours];
    t->tm_mday = _shadow_registers[Day_of_month];
    t->tm_mon = _shadow_registers[Month] - 1; // months start at '1'
    int centuries_since_1900 = t->tm_year / 100 * 100;
    // tm_year is defined as 'years since 1900'. The RTC spec instead
    // specifies the Year register as 'year in the range of 0-99'. Here we use
    // the previous centuries since 1900 (as calculated from "seconds since
    // epoch") and add them to the register value from the guest.
    t->tm_year = _shadow_registers[Year] + centuries_since_1900;
    _seconds = timegm(t);
    L4rtc_hub::set_ns_since_epoch(s_to_ns(_seconds));
    trace().printf("set time to %04d-%02d-%02d %02d:%02d:%02d\n",
                   t->tm_year + 1900, t->tm_mon, t->tm_mday,
                   t->tm_hour, t->tm_min, t->tm_sec);
  }

  /**
   * Compute the delay until the next alarm.
   *
   * \return Seconds until the alarm, 0 for "now", or ~0L if the alarm lies
   *         in the past (or the current time cannot be determined).
   */
  time_t calc_next_alarm()
  {
    time_t seconds = ns_to_s(L4rtc_hub::ns_since_epoch());
    // gmtime() returns a pointer to a static buffer, so two successive calls
    // would alias the same object. Copy the result into local storage before
    // modifying it.
    struct tm *t = gmtime(&seconds);
    if (!t)
      {
        warn().printf("Could not determine time.\n");
        return ~0L;
      }
    struct tm alarm_time = *t;
    struct tm current_time = *t;

    if (dont_care_not_set(_shadow_registers[Seconds_alarm]))
      alarm_time.tm_sec = _shadow_registers[Seconds_alarm];
    else
      {
        trace().printf("wildcard seconds\n");
        alarm_time.tm_sec += 1;
        alarm_time.tm_sec %= 60;
      }
    if (dont_care_not_set(_shadow_registers[Minutes_alarm]))
      alarm_time.tm_min = _shadow_registers[Minutes_alarm];
    else
      {
        trace().printf("wildcard minutes\n");
        alarm_time.tm_min += 1;
        alarm_time.tm_min %= 60;
      }
    if (dont_care_not_set(_shadow_registers[Hours_alarm]))
      alarm_time.tm_hour = _shadow_registers[Hours_alarm];
    else
      {
        trace().printf("wildcard hours\n");
        alarm_time.tm_hour += 1;
        alarm_time.tm_hour %= 24;
      }
    // NOTE(review): mktime() interprets the tm in local time while the values
    // stem from gmtime(); the offset cancels out in the difference below,
    // except around DST transitions.
    time_t alarm_seconds = mktime(&alarm_time);
    if (alarm_seconds == -1)
      trace().printf("error calculating alarm_seconds. Errno %i\n", errno);
    time_t current_seconds = mktime(&current_time);
    if (current_seconds == -1)
      trace().printf("error calculating current_seconds. Errno %i\n", errno);
    if (alarm_seconds < current_seconds)
      {
        trace().printf("Alarm is in the past\n");
        return ~0L;
      }
    trace().printf("alarm_seconds=%ld current_seconds=%ld\n", alarm_seconds,
                   current_seconds);
    return (alarm_seconds - current_seconds);
  }

  /**
   * React to interrupt-enable bits of a new reg B value.
   *
   * Injects update-ended/alarm interrupts immediately where due, or enqueues
   * a timeout with the timer thread for future alarms.
   */
  void handle_alarms(Status_reg_b r)
  {
    time_t next_alarm = 0;
    {
      std::lock_guard<std::mutex> lock(_mutex);
      if (r.update_ended_interrupt_enable())
        {
          trace().printf("Guest wants an update interrupt.\n");
          l4_cpu_time_t current_second = ns_to_s(l4_tsc_to_ns(l4_rdtsc()));
          _reg_c |= Update_ended_interrupt_flag;
          // rate-limit injection to at most once per second
          if (current_second > _previous_alarm_second)
            {
              _previous_alarm_second = current_second;
              _reg_c |= Interrupt_request;
              _sink.inject();
              trace().printf("Update ended interrupt injected immediately\n");
            }
        }
      if (!r.alarm_interrupt_enable())
        return;

      trace().printf("Guest wants an alarm interrupt.\n");
      next_alarm = calc_next_alarm();
      if (next_alarm == ~0L) // do not fire for alarms of the past
        return;
      if (next_alarm == 0) // guest wants an alarm right now
        {
          l4_cpu_time_t current_second = ns_to_s(l4_tsc_to_ns(l4_rdtsc()));
          _reg_c |= Alarm_interrupt_flag;
          _reg_c |= Interrupt_request;
          if (current_second > _previous_alarm_second)
            {
              _previous_alarm_second = current_second;
              _sink.inject();
              trace().printf("Alarm interrupt injected immediately\n");
            }
          return;
        }
    }
    // guest alarm is at least 1 second in the future
    // must not hold the lock when doing the IPC to the timer thread
    enqueue_timeout(&_alarm_timeout,
                    l4_kip_clock(l4re_kip()) + s_to_us(next_alarm));
    trace().printf("enqueue timeout %ld\n", next_alarm);
  }

  /// Handle a write to the register previously selected via port 0x70.
  void handle_write(l4_uint32_t value)
  {
    trace().printf("write reg %d value = 0x%x\n", _reg_sel, value & 0xff);
    l4_uint8_t val = value & 0xff;
    switch (_reg_sel)
      {
      case Status_a:
        {
          std::lock_guard<std::mutex> lock(_mutex);
          trace().printf("reg a: 0x%x\n", val);
          _reg_a.reg = val;
        }
        break;
      case Status_b:
        {
          trace().printf("reg b: 0x%x\n", val);
          Status_reg_b r;
          r.reg = val;
          // set_time() and alarms() handle the lock themselves
          handle_set_time(r);
          handle_alarms(r);
          {
            std::lock_guard<std::mutex> lock(_mutex);
            _reg_b.reg = val;
            // we only allow mode_24
            _reg_b.mode_24().set(1);
          }
        }
        break;
      case Reg_c:
      case Reg_d:
        warn().printf("Write to RO reg (%u)\n", _reg_sel);
        break;
      default:
        if (_reg_sel <= Year)
          _shadow_registers[_reg_sel] = convert_from_guest(val);
        else if (_reg_sel >= Ram_start && _reg_sel < Ram_end)
          cmos_write(_reg_sel - Ram_start, val);
        else
          warn().printf("Register write not handled (%u)\n", _reg_sel);
        break;
      }
  }

  /// Handle a read of the register previously selected via port 0x70.
  l4_uint32_t handle_read()
  {
    trace().printf("read reg %d\n", _reg_sel);
    // these registers need to always work
    switch (_reg_sel)
      {
      case Status_a:
        {
          std::lock_guard<std::mutex> lock(_mutex);
          return _reg_a.reg;
        }
      case Status_b:
        {
          std::lock_guard<std::mutex> lock(_mutex);
          return _reg_b.reg;
        }
      case Reg_c:
        {
          std::lock_guard<std::mutex> lock(_mutex);
          unsigned ret = _reg_c;
          trace().printf("reg c: %x\n", _reg_c);
          // reading clears the status bits
          _reg_c = 0;
          _sink.ack();
          return ret;
        }
      case Reg_d:
        return Valid_ram_and_time;
      }

    // only update time if guest does not currently try to set a new time
    if (!_reg_b.set())
      _seconds = ns_to_s(L4rtc_hub::ns_since_epoch());
    struct tm *t = gmtime(&_seconds);
    if (!t)
      {
        warn().printf("Could not determine time.\n");
        return 0;
      }
    l4_uint32_t ret = 0;
    switch (_reg_sel)
      {
      case Seconds:
        ret = convert_to_guest(t->tm_sec);
        break;
      case Seconds_alarm:
        ret = convert_to_guest(_shadow_registers[Seconds_alarm]);
        break;
      case Minutes:
        ret = convert_to_guest(t->tm_min);
        break;
      case Minutes_alarm:
        ret = convert_to_guest(_shadow_registers[Minutes_alarm]);
        break;
      case Hours:
        ret = convert_to_guest(t->tm_hour);
        break;
      case Hours_alarm:
        ret = convert_to_guest(_shadow_registers[Hours_alarm]);
        break;
      case Weekday:
        ret = convert_to_guest(t->tm_wday);
        break;
      case Day_of_month:
        ret = convert_to_guest(t->tm_mday);
        break;
      case Month:
        ret = convert_to_guest(t->tm_mon + 1); // gmtime returns months counting from zero
        break;
      case Year:
        ret = convert_to_guest(t->tm_year % 100);
        break;
      default:
        // Use the same boundary convention as handle_write(): Ram_end is
        // exclusive (_reg_sel is masked to 7 bits anyway, so it can never
        // exceed 0x7f).
        if (Ram_start > _reg_sel || _reg_sel >= Ram_end)
          warn().printf("Unknown register read (%d)\n", _reg_sel);
        else
          ret = cmos_read(_reg_sel - Ram_start);
        break;
      }
    return ret;
  }

public:
  Rtc(cxx::Ref_ptr<Gic::Ic> const &ic, int irq)
  : Pm_device(), _alarm_timeout(this), _sink(ic, irq), _previous_alarm_second(0)
  {
    info().printf("Hello from RTC. Irq=%d\n", irq);
#if !defined(CONFIG_UVMM_EXTERNAL_RTC) and !(CONFIG_RELEASE_MODE)
    warn().printf(
      "No external clock source. Rtc time will not represent wallclock time.\n"
      "Set CONFIG_UVMM_EXTERNAL_RTC = y if you have an external clock "
      "source.\n");
#endif
    _seconds = ns_to_s(L4rtc_hub::ns_since_epoch());
  }

  void pm_suspend() override
  {}

  void pm_resume() override
  {
    // tell the guest that the machine has resumed from suspend
    // use the PS/2 shutdown status byte as expected by firmware
    cmos_write(1, 0xfe);
  }

  char const *dev_name() const override
  { return "RTC"; }

  /* IO write from the guest to device */
  void io_out(unsigned port, Vmm::Mem_access::Width, l4_uint32_t value) override
  {
    switch (port)
      {
      case 0:
        // register select; bit 7 (NMI mask on real hardware) is ignored
        _reg_sel = value & 0x7f;
        break;
      case 1:
        handle_write(value);
        break;
      default:
        warn().printf("Unknown port written (%u).\n", port);
        break;
      }
  }

  /* IO read from the guest */
  void io_in(unsigned port, Vmm::Mem_access::Width, l4_uint32_t *value) override
  {
    switch (port)
      {
      case 0:
        *value = _reg_sel;
        break;
      case 1:
        *value = handle_read();
        break;
      default:
        warn().printf("Unknown port read (%u).\n", port);
        break;
      };
  }

  ~Rtc()
  {
    dequeue_timeout(&_alarm_timeout);
  }

private:
  static Dbg info() { return Dbg(Dbg::Dev, Dbg::Info, "RTC"); }
  static Dbg warn() { return Dbg(Dbg::Dev, Dbg::Warn, "RTC"); }
  static Dbg trace() { return Dbg(Dbg::Dev, Dbg::Trace, "RTC"); }

  static l4_uint64_t ns_to_s(l4_uint64_t ns) { return ns / 1'000'000'000; }
  static l4_uint64_t s_to_us(l4_uint64_t s) { return s * 1'000'000; }
  static l4_uint64_t s_to_ns(l4_uint64_t s) { return s * 1'000'000'000; }

  /// Alarm registers with the highest bits set (0xC0 - 0xFF) are don't care.
  static bool dont_care_not_set(l4_uint8_t reg)
  {
    enum { Dont_care_bits = 0xC0 };
    return (reg & Dont_care_bits) != Dont_care_bits;
  }

  void cmos_write(l4_uint8_t regsel, l4_uint16_t value)
  {
    assert(regsel < Ram_size);
    trace().printf("cmos write(%u, 0x%x)\n", regsel, value);
    _cmos[regsel] = value;
  }

  l4_uint16_t cmos_read(l4_uint8_t regsel)
  {
    assert(regsel < Ram_size);
    trace().printf("cmos read(%u) = 0x%x\n", regsel, _cmos[regsel]);
    return _cmos[regsel];
  }

  l4_uint8_t _reg_sel = 0;          ///< register selected via port 0x70
  Status_reg_a _reg_a;
  Status_reg_b _reg_b;
  l4_uint8_t _reg_c = 0;
  l4_uint8_t _reg_d = 0;            ///< currently unused
  // These are written to by the guest.
  l4_uint8_t _shadow_registers[Year + 1];
  // protect members from concurrent access
  std::mutex _mutex;
  Alarm _alarm_timeout;             ///< Object handling timeout expired events.
  // seconds since epoch as determined by external clock source
  time_t _seconds;
  l4_uint16_t _cmos[Ram_size];      ///< battery-backed CMOS RAM area
  Vmm::Irq_sink _sink;
  l4_cpu_time_t _previous_alarm_second;
}; // class Rtc
} // namespace Vdev
namespace {
/// Factory for the "virt-rtc" device tree compatible.
///
/// Validates the interrupt configuration (virtual IC, IRQ 8) and registers
/// the RTC at the legacy I/O ports 0x70/0x71.
struct F : Vdev::Factory
{
  static Dbg info() { return Dbg(Dbg::Dev, Dbg::Info, "RTC"); }

  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &node) override
  {
    Vdev::Irq_dt_iterator it(devs, node);
    if (it.next(devs) < 0)
      return nullptr;
    if (!it.ic_is_virt())
      {
        // Fix: terminate log lines with '\n' like all other messages.
        info().printf("RTC requires a virtual interrupt controller.\n");
        return nullptr;
      }
    if (it.irq() != 8)
      {
        info().printf("DT Node must specify IRQ 8 for the RTC.\n");
        return nullptr;
      }
    auto dev = Vdev::make_device<Vdev::Rtc>(it.ic(), it.irq());
    // Legacy CMOS RTC port pair: 0x70 index, 0x71 data.
    auto region = Vmm::Io_region(0x70, 0x71, Vmm::Region_type::Virtual);
    devs->vmm()->add_io_device(region, dev);
    devs->vmm()->register_timer_device(dev);
    return dev;
  }
}; // struct F
static F f;
static Vdev::Device_type t = {"virt-rtc", nullptr, &f};
} // namespace

View File

@@ -0,0 +1,213 @@
/*
* Copyright (C) 2017, 2019, 2021-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Benjamin Lamowski <benjamin.lamowski@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/util/cpu.h>
#include "vcpu_ptr.h"
#include "vm_state_svm.h"
#include "vm_state_vmx.h"
#include "pt_walker.h"
#include "mad.h"
#include "guest.h"
namespace Vmm {
/**
 * Allocate the vendor-specific VM state object and stash its pointer in the
 * vCPU user data (Reg_vmm_type).
 *
 * \param type  Hardware virtualization flavor (Vmx or Svm).
 *
 * \throws L4::Runtime_error(-L4_ENOSYS)  For unsupported types.
 */
void
Vcpu_ptr::create_state(Vm_state::Type type)
{
  if (type == Vm_state::Type::Vmx)
    {
      auto state = reinterpret_cast<l4_vm_vmx_vcpu_state_t *>(_s);
      auto vmx = new Vmx_state(state);
      _s->user_data[Reg_vmm_type] = reinterpret_cast<l4_umword_t>(vmx);
    }
  else if (type == Vm_state::Type::Svm)
    _s->user_data[Reg_vmm_type] =
      reinterpret_cast<l4_umword_t>(new Svm_state(extended_state()));
  else
    throw L4::Runtime_error(-L4_ENOSYS, "Unsupported HW virtualization type.");
}

/**
 * Detect the host CPU vendor via CPUID and map it to the VM state type.
 *
 * \return Vm_state::Type::Vmx on Intel, Vm_state::Type::Svm on AMD.
 *
 * \throws L4::Runtime_error(-L4_ENOSYS)  If CPUID is unavailable, the vendor
 *                                        is unknown, or a required SVM
 *                                        feature is missing.
 */
Vm_state::Type
Vcpu_ptr::determine_vmm_type()
{
  if (!l4util_cpu_has_cpuid())
    throw L4::Runtime_error(-L4_ENOSYS,
                            "Platform does not support CPUID. Aborting!\n");
  l4_umword_t ax, bx, cx, dx;
  // CPUID leaf 0 returns the vendor string in ebx:edx:ecx.
  l4util_cpu_cpuid(0, &ax, &bx, &cx, &dx);
  // GenuineIntel
  if (bx == 0x756e6547 && cx == 0x6c65746e && dx == 0x49656e69)
    return Vm_state::Type::Vmx;
  // AuthenticAMD
  else if (bx == 0x68747541 && cx == 0x444d4163 && dx == 0x69746e65)
    {
      warn().printf(">>> CAUTION: Support for AMD SVM is experimental, use at your own risk! <<<\n");
      // Check if the SVM features we need are present.
      l4util_cpu_cpuid(0x8000000a, &ax, &bx, &cx, &dx);
      if (!(dx & Svm_state::Cpuid_svm_feature_nrips))
        L4Re::throw_error(-L4_ENOSYS,
                          "SVM does not support next_rip save. Aborting!\n");
      // It should be safe to assume that the decode assists feature is
      // present, since all modern AMD CPUs (starting with Bulldozer)
      // implement it. However, QEMU or rather KVM-based nested virtualization
      // does not report that the feature is present (see svm_set_cpu_caps()),
      // but still provides decode assist information, e.g. for writes to CR0.
      if (!(dx & Svm_state::Cpuid_svm_feature_decode_assists))
        warn().printf("Platform does not support SVM decode assists (misreported on QEMU).\n");
      return Vm_state::Type::Svm;
    }
  else
    throw L4::Runtime_error(-L4_ENOSYS, "Platform not supported. Aborting!\n");
}
/// Mem_access::Kind::Other symbolises failure to decode.
/**
 * Decode the instruction at the guest IP into an abstract MMIO access.
 *
 * Walks the guest page table to obtain the host-virtual address of the
 * faulting instruction, decodes it and fills a Mem_access descriptor.
 * For loads, the target register number is stashed in Reg_mmio_read so
 * writeback_mmio() can complete the access later.
 *
 * \return Decoded access; m.access == Mem_access::Other if decoding failed.
 */
Mem_access
Vcpu_ptr::decode_mmio() const
{
  Mem_access m;
  m.access = Mem_access::Other;

  auto *vms = vm_state();
  l4_uint64_t opcode;
  try
    {
      // overwrite the virtual IP with the physical OP code
      opcode = get_pt_walker()->walk(vms->cr3(), vms->ip());
    }
  catch (L4::Runtime_error &e)
    {
      warn().printf("[%3u] Could not determine opcode for MMIO access. Page table "
                    "walking failed for IP 0x%lx and reports: %s\n",
                    get_vcpu_id(), vms->ip(),
                    e.extra_str() ? e.extra_str() : "");
      return m;
    }

  // amd64: vcpu regs == exc_regs
  l4_exc_regs_t *reg = reinterpret_cast<l4_exc_regs_t *>(&_s->r);

  using namespace L4mad;
  unsigned char *inst_buf = reinterpret_cast<unsigned char *>(opcode);
  // TODO: Limit inst_buf_len to size until the next non-contiguous page
  //       boundary if it is < Decoder::Max_instruction_len.
  unsigned inst_buf_len = Decoder::Max_instruction_len;
  Decoder decoder(reg, vms->ip(), inst_buf, inst_buf_len);
  bool decoded = false;
  Op op;
  Desc tgt, src;
  switch (decoder.decode(&op, &tgt, &src))
    {
    case Decoder::Result::Success: decoded = true; break;
    case Decoder::Result::Unsupported: break;
    case Decoder::Result::Invalid:
      // TODO: If size of instruction buffer is < Decoder::Max_instruction_len,
      //       because instruction lies on a non-contiguous page boundary,
      //       use a temporary buffer to hold instruction bytes from both pages
      //       and retry decoding from that.
      break;
    }

  if (!decoded)
    {
      // Dump a byte window around the instruction for diagnosis.
      // NOTE(review): text[-7]..text[8] may cross page boundaries of the
      // mapped guest memory — presumably safe for contiguously mapped RAM;
      // confirm.
      unsigned char const *text = reinterpret_cast<unsigned char *>(opcode);
      Dbg().printf("[%3u] Decoding failed at 0x%lx: %02x %02x %02x %02x %02x "
                   "%02x %02x <%02x> %02x %02x %02x %02x %02x %02x %02x %02x\n",
                   get_vcpu_id(), vms->ip(),
                   text[-7], text[-6], text[-5], text[-4], text[-3],
                   text[-2], text[-1], text[0], text[1], text[2], text[3],
                   text[4], text[5], text[6], text[7], text[8]);
      return m;
    }

  if (0)
    decoder.print_insn_info(op, tgt, src);

  m.width = op.access_width;
  if (tgt.dtype != L4mad::Desc_reg && tgt.dtype != L4mad::Desc_mem)
    {
      Dbg().printf("[%3u] tgt type invalid %i\n", get_vcpu_id(), tgt.dtype);
      return m;
    }

  // SRC and TGT.val contain the register number of the MMIO access. In case of
  // write, this register can be decoded to the value.
  // In case of read I need to save the register number and write to this
  // register in writeback_mmio.
  // translate to Mem_access;
  if (op.atype == L4mad::Read)
    {
      m.access = Mem_access::Load;
      _s->user_data[Reg_mmio_read] = tgt.val >> tgt.shift;
    }
  else if (op.atype == L4mad::Write)
    {
      m.access = Mem_access::Store;
      switch (src.dtype)
        {
        case L4mad::Desc_reg:
          // src.val is the register number in MAD order; which is inverse to
          // register order in l4_vcpu_regs_t.
          m.value = *decode_reg_ptr(src.val) >> src.shift;
          break;
        case L4mad::Desc_imm:
          m.value = src.val;
          break;
        default:
          assert(false);
          m.value = 0;
          break;
        }
    }
  // else unknown; Other already set.
  return m;
}
/// Map a MAD register number to the corresponding slot in the vCPU register
/// file. MAD numbers registers in the inverse order of l4_vcpu_regs_t, so
/// the index is mirrored relative to the last register.
l4_umword_t *
Vcpu_ptr::decode_reg_ptr(int value) const
{
  auto *regs = reinterpret_cast<l4_umword_t *>(&_s->r);
  int mirrored_idx = L4mad::Num_registers - 1 - value;
  return regs + mirrored_idx;
}
/**
 * Perform the initial (cold) reset of this vCPU and enter the guest.
 *
 * \param protected_mode  Enter protected mode (Linux direct boot) instead of
 *                        real mode. Entry IP/SP are taken from _s->r.
 *
 * Does not return to the caller: control passes to Guest::run_vm().
 */
void
Vcpu_ptr::reset(bool protected_mode)
{
  vm_state()->init_state();

  // If Uvmm is to boot a Linux kernel directly, it will do so in protected
  // mode as is required in Linux' boot protocol. Otherwise the Boot and
  // Application Processors are expected to come up in Real Mode.
  if (protected_mode)
    vm_state()->setup_linux_protected_mode(_s->r.ip, _s->r.sp);
  else
    vm_state()->setup_real_mode(_s->r.ip);

  Guest::get_instance()->run_vm(*this);
}

/**
 * Re-initialize the vCPU state for a warm reset (always real mode).
 */
void
Vcpu_ptr::hot_reset()
{
  // assumption: reset while we already went through the normal reset once.
  // intention: Do not call Guest::run_vm() again.
  vm_state()->init_state();
  vm_state()->setup_real_mode(_s->r.ip);
}
} // namespace Vmm

View File

@@ -0,0 +1,94 @@
/*
* Copyright (C) 2017-2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include "generic_vcpu_ptr.h"
#include "mem_access.h"
#include "vm_state.h"
namespace Vmm {
class Pt_walker;
/**
 * x86-specific view onto a vCPU state page.
 *
 * Wraps the raw l4_vcpu_state_t pointer and provides access to the
 * vendor-specific Vm_state (VMX/SVM) stored in the vCPU user data.
 */
class Vcpu_ptr : public Generic_vcpu_ptr
{
public:
  /// Architecture-specific slots in the vCPU user data array.
  enum User_data_regs_arch
  {
    Reg_vmm_type = Reg_arch_base,  ///< pointer to the Vm_state object
    Reg_mmio_read,                 ///< target register of a pending MMIO load
    // <insert further register usage here>
    Reg_must_be_last_before_ucode,
    Reg_ucode_rev = 6, // must be in sync with Fiasco
  };
  static_assert(Reg_ucode_rev >= Reg_must_be_last_before_ucode,
                "Last user data register is reserved for microcode revision.");

  /// Wrap a vCPU state page; allocates the Vm_state if \a s is non-null.
  explicit Vcpu_ptr(l4_vcpu_state_t *s) : Generic_vcpu_ptr(s)
  {
    if (s)
      create_state(determine_vmm_type());
  }

  /// True if the current guest page fault was a write access.
  bool pf_write() const
  {
    return vm_state()->pf_write();
  }

  /// Enable extended vCPU operation for the calling thread.
  void thread_attach()
  {
    control_ext(L4::Cap<L4::Thread>());
  }

  /// Vendor-specific VM state stored by create_state().
  Vm_state *vm_state() const
  { return reinterpret_cast<Vm_state *>(_s->user_data[Reg_vmm_type]);}

  Mem_access decode_mmio() const;

  /// Complete a decoded MMIO load by writing the value into the register
  /// recorded in Reg_mmio_read during decode_mmio().
  void writeback_mmio(Mem_access const m)
  {
    // used to write read value back to register it is read to.
    *decode_reg_ptr(_s->user_data[Reg_mmio_read]) = m.value;
  }

  void reset(bool protected_mode);
  void hot_reset();

  /// Microcode revision as reported by the kernel.
  l4_umword_t ucode_revision() const
  { return _s->user_data[Reg_ucode_rev]; }

  /// Dump the general-purpose register file via the given debug printer.
  template <typename ERR_DBG>
  void dump_regs_t(l4_addr_t vm_ip, ERR_DBG out) const
  {
    unsigned vcpu_id = get_vcpu_id();
    l4_vcpu_regs_t *regs = &_s->r;
    out.printf("[%3u] RAX 0x%lx\nRBX 0x%lx\nRCX 0x%lx\nRDX 0x%lx\nRSI 0x%lx\n"
               "RDI 0x%lx\nRSP 0x%lx\nRBP 0x%lx\nR8 0x%lx\nR9 0x%lx\n"
               "R10 0x%lx\nR11 0x%lx\nR12 0x%lx\nR13 0x%lx\nR14 0x%lx\n"
               "R15 0x%lx\nRIP 0x%lx\nvCPU RIP 0x%lx\n",
               vcpu_id, regs->ax, regs->bx, regs->cx, regs->dx, regs->si,
               regs->di, regs->sp, regs->bp, regs->r8, regs->r9, regs->r10,
               regs->r11, regs->r12, regs->r13, regs->r14, regs->r15, vm_ip,
               regs->ip);
  }

private:
  /// Extended vCPU state area following the vCPU state page.
  void *extended_state() const
  {
    return (void *)(((char *)_s) + L4_VCPU_OFFSET_EXT_STATE);
  }

  Vm_state::Type determine_vmm_type();
  void create_state(Vm_state::Type type);
  l4_umword_t *decode_reg_ptr(int value) const;
}; // class Vcpu_ptr
} // namespace Vmm

View File

@@ -0,0 +1,431 @@
/*
* Copyright (C) 2017-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/re/env>
#include <l4/re/error_helper>
#include <l4/re/util/cap_alloc>
#include <l4/re/util/unique_cap>
#include <climits>
#include "debug.h"
#include "virt_lapic.h"
#include "mad.h"
#include "guest.h"
namespace Gic {
using L4Re::chkcap;
using L4Re::chksys;
/**
 * Construct a virtual local APIC for the given vCPU.
 *
 * \param id   x2APIC ID of this LAPIC.
 * \param cpu  vCPU this LAPIC belongs to.
 *
 * Creates the notification IRQ used to wake the vCPU and initializes all
 * LAPIC registers to their architectural reset values.
 */
Virt_lapic::Virt_lapic(unsigned id, cxx::Ref_ptr<Vmm::Cpu_dev> cpu)
: _lapic_irq(chkcap(L4Re::Util::make_unique_cap<L4::Irq>(),
                    "Allocate local APIC notification IRQ.")),
  _lapic_x2_id(id),
  _lapic_version(Lapic_version),
  _x2apic_enabled(false),
  _nmi_pending(false),
  _cpu(cpu),
  _registry(cpu->vcpu().get_ipc_registry())
{
  trace().printf("Virt_lapic ctor; ID 0x%x\n", id);

  chksys(L4Re::Env::env()->factory()->create(_lapic_irq.get()),
         "Create APIC IRQ.");

  // Set reset values of the LAPIC registers
  memset(&_regs, 0, sizeof(_regs));
  _regs.dfr = -1U;
  // LVT entries reset with the mask bit (bit 16) set.
  _regs.cmci = _regs.therm = _regs.perf = 0x00010000;
  _regs.lint[0] = _regs.lint[1] = _regs.err = 0x00010000;
  _regs.svr = 0x000000ff;

  _apic_timer = Vdev::make_device<Apic_timer>(this);
}
/// Ic interface: raise the given interrupt on this LAPIC.
void
Virt_lapic::set(unsigned irq)
{
  irq_trigger(irq);
}

/// Deliver an MSI/IPI described by the MSI data register to this LAPIC.
void
Virt_lapic::set(Vdev::Msix::Data_register_format data)
{
  // assumption 1: delivery_mode lowest prio already arbitrated
  // assumption 2: only called if this APIC is destination
  using namespace Vdev::Msix;
  switch (data.delivery_mode())
    {
    case Dm_fixed: [[fallthrough]];
    case Dm_lowest_prio:
      irq_trigger(data.vector(), data.trigger_mode(), true);
      break;
    case Dm_smi: info().printf("SMI dropped at LAPIC 0x%x\n", id()); break;
    case Dm_nmi: nmi(); break;
    case Dm_init: init_ipi(); break;
    case Dm_startup: startup_ipi(data); break;
    // ExtINT bypasses the IRR/ISR logic.
    case Dm_extint: irq_trigger(data.vector(), false, false); break;
    default:
      info().printf("LAPIC 0x%x drops unknown MSI. Delivery mode 0x%x, Vector "
                    "0x%x, data: 0x%llx\n",
                    id(), data.delivery_mode().get(), data.vector().get(),
                    data.raw);
      break;
    };
}

/// Handle an INIT IPI: put the vCPU into wait-for-SIPI and reset the SIPI
/// counter so the next SIPI is acted upon.
void
Virt_lapic::init_ipi()
{
  // Only sleeping vCPUs must be rescheduled
  if (_cpu->get_cpu_state() == Vmm::Cpu_dev::Sleeping)
    _cpu->reschedule();

  _cpu->send_init_ipi();
  _sipi_cnt = 0;
}

/// Handle a STARTUP IPI: start the vCPU at the page encoded in the vector.
void
Virt_lapic::startup_ipi(Vdev::Msix::Data_register_format data)
{
  // only act on the first SIPI
  if (_sipi_cnt++)
    return;

  enum : l4_uint32_t
  {
    Icr_startup_page_shift = 12
  };
  // SIPI vector selects a 4K-aligned real-mode entry page.
  l4_addr_t start_eip = data.vector() << Icr_startup_page_shift;
  start_cpu(start_eip);
  _cpu->send_sipi();
}

/// Record the real-mode entry point in the vCPU registers; the actual start
/// happens when the vCPU performs its reset.
void
Virt_lapic::start_cpu(l4_addr_t entry)
{
  Vmm::Vcpu_ptr vcpu = _cpu->vcpu();
  vcpu->r.sp = 0;
  vcpu->r.ip = entry; // r.ip used to communicate entry to Vcpu_ptr.reset()

  info().printf("Starting CPU %u on EIP 0x%lx\n", _lapic_x2_id, entry);
}

/// Register (or replace) the EOI handler for the given vector.
/// Passing nullptr unbinds the handler.
void
Virt_lapic::bind_irq_src_handler(unsigned irq, Irq_src_handler *handler)
{
  assert (irq < 256); // sources array length
  if(handler && _sources[irq] && handler != _sources[irq])
    info().printf("[LAPIC 0x%x] IRQ src handler for IRQ %u already set to "
                  "%p, new %p\n",
                  _lapic_x2_id, irq, _sources[irq], handler);
  _sources[irq] = handler;
}

/// Look up the EOI handler bound to the given vector (may be nullptr).
Irq_src_handler *
Virt_lapic::get_irq_src_handler(unsigned irq) const
{
  assert (irq < 256); // sources array length
  return _sources[irq];
}

/// Device-tree interrupt parsing is not applicable to the LAPIC.
int
Virt_lapic::dt_get_interrupt(fdt32_t const *, int, int *) const
{ return 1; }

/// Mark an NMI pending and wake the vCPU via the notification IRQ.
void
Virt_lapic::nmi()
{
  _nmi_pending.store(true, std::memory_order_release);
  _lapic_irq->trigger();
}
/**
 * Enqueue an interrupt and trigger an IPC in the vCPU.
 *
 * \param irq    Interrupt to inject.
 * \param level  True for level-triggered interrupts (recorded in the TMR);
 *               only evaluated if \a irr is true.
 * \param irr    True to queue via the IRR/ISR machinery; false to bypass it
 *               (ExtINT-style) via the FIFO of non-IRR interrupts.
 *
 * This function is called from the timer thread.
 */
void
Virt_lapic::irq_trigger(l4_uint32_t irq, bool level, bool irr)
{
  bool trigger = true;
  {
    std::lock_guard<std::mutex> lock(_int_mutex);
    if (irr)
      {
        // don't trigger lapic_irq, if the IRR has this IRQ already queued.
        trigger = !_regs.irr.set_irq(irq);

        if (level)
          _regs.tmr.set_irq(irq);
        else
          _regs.tmr.clear_irq(irq);
      }
    else
      {
        // don't trigger lapic_irq again, if an IRQ is already queued.
        trigger = _non_irr_irqs.empty();
        _non_irr_irqs.push(irq);
      }
  }

  if (trigger)
    _lapic_irq->trigger();
}

/// Atomically consume a pending NMI. Returns true if one was pending.
bool
Virt_lapic::next_pending_nmi()
{
  bool expected = true;
  return _nmi_pending.compare_exchange_strong(expected, false,
                                              std::memory_order_acquire,
                                              std::memory_order_relaxed);
}

/// Non-consuming check whether an NMI is pending.
bool
Virt_lapic::is_nmi_pending()
{ return _nmi_pending.load(std::memory_order_relaxed); }

/**
 * Fetch the next injectable interrupt vector, or -1 if none.
 *
 * Non-IRR (ExtINT) interrupts take precedence; otherwise the highest IRR
 * vector is moved to the ISR (i.e. marked in service) and returned,
 * provided it outranks the highest vector currently in service.
 */
int
Virt_lapic::next_pending_irq()
{
  std::lock_guard<std::mutex> lock(_int_mutex);

  if (!_non_irr_irqs.empty())
    {
      unsigned irq = _non_irr_irqs.front();
      _non_irr_irqs.pop();
      return irq;
    }

  auto highest_irr = _regs.irr.get_highest_irq();
  if (highest_irr >= 0)
    {
      auto highest_isr = _regs.isr.get_highest_irq();
      if (highest_irr > highest_isr)
        {
          _regs.isr.set_irq(highest_irr);
          _regs.irr.clear_irq(highest_irr);
          return highest_irr;
        }
    }
  return -1;
}

/// True if any interrupt (IRR or non-IRR) is waiting for injection.
bool
Virt_lapic::is_irq_pending()
{
  std::lock_guard<std::mutex> lock(_int_mutex);

  return !_non_irr_irqs.empty() || _regs.irr.has_irq();
}
/**
 * Handle a read of an APIC MSR (x2APIC MSR range and IA32_APIC_BASE).
 *
 * \param      msr    MSR number.
 * \param[out] value  Value read.
 *
 * \retval true   MSR is handled by this LAPIC, \a value is valid.
 * \retval false  MSR not handled here.
 */
bool
Virt_lapic::read_msr(unsigned msr, l4_uint64_t *value) const
{
  switch (msr)
    {
    case Msr_ia32_apic_base: // APIC base, Vol. 3A 10.4.4
      *value = Lapic_access_handler::Mmio_addr | Apic_base_enabled;

      if (_lapic_x2_id == 0)
        *value |= Apic_base_bsp_processor;

      if (_x2apic_enabled)
        *value |= Apic_base_x2_enabled;
      break;
    case Msr_ia32_tsc_deadline:
      *value = _apic_timer->read_tsc_deadline_msr();
      break;
    case Msr_ia32_x2apic_apicid:
      // In xAPIC mode the ID lives in the upper byte of the register.
      *value = _x2apic_enabled
                 ? _lapic_x2_id
                 : (_lapic_x2_id << Xapic_mode_local_apic_id_shift);
      break;
    case Msr_ia32_x2apic_version: *value = _lapic_version; break;
    case Msr_ia32_x2apic_tpr: *value = _regs.tpr; break;
    case Msr_ia32_x2apic_ppr: *value = _regs.ppr; break;
    case Msr_ia32_x2apic_ldr: *value = _regs.ldr; break;
    case Mmio_apic_destination_format_register:
      // not existent in x2apic mode
      if (!_x2apic_enabled)
        *value = _regs.dfr;
      break;
    case Msr_ia32_x2apic_sivr: *value = _regs.svr; break;
    // ISR0..ISR7: 256-bit in-service register, 32 bits per MSR.
    case 0x810:
    case 0x811:
    case 0x812:
    case 0x813:
    case 0x814:
    case 0x815:
    case 0x816:
    case Msr_ia32_x2apic_isr7:
      *value = _regs.isr.get_reg(msr - 0x810);
      break;
    // TMR0..TMR7: trigger-mode register.
    case 0x818:
    case 0x819:
    case 0x81a:
    case 0x81b:
    case 0x81c:
    case 0x81d:
    case 0x81e:
    case Msr_ia32_x2apic_tmr7:
      *value = _regs.tmr.get_reg(msr - 0x818);
      break;
    // IRR0..IRR7: interrupt-request register.
    case 0x820:
    case 0x821:
    case 0x822:
    case 0x823:
    case 0x824:
    case 0x825:
    case 0x826:
    case Msr_ia32_x2apic_irr7:
      *value = _regs.irr.get_reg(msr - 0x820);
      break;
    case Msr_ia32_x2apic_esr: *value = _regs.esr; break;
    case Msr_ia32_x2apic_lvt_cmci: *value = _regs.cmci; break;
    // 0x830 handled by Icr_handler
    case Msr_ia32_x2apic_lvt_timer:
      *value = _apic_timer->read_lvt_timer_reg();
      break;
    case Msr_ia32_x2apic_lvt_thermal: *value = _regs.therm; break;
    case Msr_ia32_x2apic_lvt_pmi: *value = _regs.perf; break;
    case Msr_ia32_x2apic_lvt_lint0: *value = _regs.lint[0]; break;
    case Msr_ia32_x2apic_lvt_lint1: *value = _regs.lint[1]; break;
    case Msr_ia32_x2apic_lvt_error: *value = _regs.err; break;
    case Msr_ia32_x2apic_init_count:
      *value = _apic_timer->read_tmr_init();
      break;
    case Msr_ia32_x2apic_cur_count: *value = _apic_timer->read_tmr_cur(); break;
    case Msr_ia32_x2apic_div_conf:
      *value = _apic_timer->read_divide_configuration_reg();
      break;
    default: return false;
    }

  if (0)
    Dbg().printf("ReadAPIC MSR 0x%x. Result: 0x%x\n", (unsigned)msr,
                 (unsigned)*value);
  return true;
}
/// Handle a guest WRMSR to one of the (x2)APIC registers.
///
/// \param msr    MSR number written by the guest.
/// \param value  Value written by the guest.
///
/// \retval true   The MSR was handled (write applied or deliberately ignored).
/// \retval false  The MSR is not handled here, or the write must fault in the
///                guest (SELF_IPI outside x2APIC mode incurs a #GP).
bool
Virt_lapic::write_msr(unsigned msr, l4_uint64_t value)
{
  switch(msr)
    {
    case Msr_ia32_apic_base:
      // Track the x2APIC enable bit; entering x2APIC mode derives the
      // logical destination register from the x2APIC ID.
      _x2apic_enabled = value & Apic_base_x2_enabled;
      if (_x2apic_enabled)
        {
          Dbg().printf("------ x2APIC enabled\n");
          // from Intel SDM (October 2017)
          // Logical x2APIC ID = [(x2APIC ID[19:4] « 16) | (1 « x2APIC ID[3:0])]
          // NOTE(review): `(_lapic_x2_id & 0xffff0) << 16` shifts the
          // still-unshifted bits [19:4] up by 16, i.e. into bits [35:20];
          // the SDM formula suggests `(_lapic_x2_id >> 4) << 16`. Only
          // correct for IDs < 16 -- verify for larger APIC IDs.
          _regs.ldr =
            (_lapic_x2_id & 0xffff0) << 16 | 1U << (_lapic_x2_id & 0xf);
        }
      // APIC Base field, Vol. 3A 10.4.4
      // NOTE(review): the bitwise AND only fires when *none* of the
      // overlapping page-number bits are set; presumably the intent is to
      // warn whenever the base differs from Mmio_addr (`(value >> 12) !=
      // (Mmio_addr >> 12)`) -- confirm.
      if (!((value >> 12) & (Lapic_access_handler::Mmio_addr >> 12)))
        // Vol. 3A 10.4.5
        warn().printf(
          "Relocating the Local APIC Registers is not supported.\n");
      break;
    case Msr_ia32_tsc_deadline:
      // Forwarded to the timer device, which arms the TSC deadline.
      _apic_timer->write_tsc_deadline_msr(value);
      break;
    case Msr_ia32_x2apic_version: break; // RO register: ignore write
    case Msr_ia32_x2apic_tpr: _regs.tpr = value; break;
    case Msr_ia32_x2apic_ldr:
      // not writable in x2apic mode
      if (!_x2apic_enabled)
        _regs.ldr = value;
      break;
    case Mmio_apic_destination_format_register:
      // not existent in x2apic mode; writes by system software only in
      // disabled APIC state; which currently isn't supported. => write ignored
      break;
    case Msr_ia32_x2apic_sivr:
      _regs.svr = value; break; // TODO react on APIC SW en/disable
    case Msr_ia32_x2apic_eoi:
      {
        // Retire the highest-priority in-service interrupt and notify its
        // source handler so level-triggered sources can re-arm.
        std::lock_guard<std::mutex> lock(_int_mutex);
        int irq_num = _regs.isr.clear_highest_irq();
        if (irq_num > 0)
          {
            Irq_src_handler *hdlr = get_irq_src_handler(irq_num);
            if (hdlr)
              hdlr->eoi();
          }
      }
      // NOTE(review): a non-zero EOI write in x2APIC mode should raise #GP
      // per the SDM; here it is only logged -- confirm this is intentional.
      if (value != 0)
        {
          Dbg().printf("WARNING: write to EOI not zero, 0x%llx\n", value);
        }
      break;
    case Msr_ia32_x2apic_esr: _regs.esr = 0; break;
    case Msr_ia32_x2apic_lvt_cmci: _regs.cmci = value; break;
    // 0x830 handled by Icr_handler
    case Msr_ia32_x2apic_lvt_timer:
      _apic_timer->write_lvt_timer_reg(value);
      break;
    case Msr_ia32_x2apic_lvt_thermal: _regs.therm = value; break;
    case Msr_ia32_x2apic_lvt_pmi: _regs.perf = value; break;
    case Msr_ia32_x2apic_lvt_lint0: _regs.lint[0] = value; break;
    case Msr_ia32_x2apic_lvt_lint1: _regs.lint[1] = value; break;
    case Msr_ia32_x2apic_lvt_error: _regs.err = value; break;
    case Msr_ia32_x2apic_init_count:
      _apic_timer->write_tmr_init(value);
      break;
    case Msr_ia32_x2apic_div_conf:
      _apic_timer->write_divide_configuration_reg(value);
      break;
    case Msr_ia32_x2apic_self_ipi:
      // Self-IPI: deliver the vector in bits [7:0] to this LAPIC.
      if (_x2apic_enabled)
        irq_trigger(value & 0xff);
      else
        // if X2APIC is not enabled, writing IA32_SELF_IPI incurs a #GP
        return false;
      break;
    default: return false;
    }

  // Debug aid, disabled by default; 0x80b is the (noisy) EOI register.
  if (0 && msr != 0x80b)
    Dbg().printf("WARNING: APIC write to 0x%x: 0x%llx\n", msr, value);
  return true;
}
} // namespace Gic
#include "device_factory.h"
#include "guest.h"
namespace {
struct F : Vdev::Factory
{
cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
Vdev::Dt_node const &) override
{
auto apics = devs->vmm()->apic_array();
auto msix_ctrl = Vdev::make_device<Gic::Msix_control>(apics);
devs->vmm()->icr_handler()->register_msix_ctrl(msix_ctrl);
return msix_ctrl;
}
};
static F f;
static Vdev::Device_type e = {"intel,msi-controller", nullptr, &f};
} // namespace

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,86 @@
/*
* Copyright (C) 2017-2019, 2022-2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/types.h>
#include <l4/cxx/bitfield>
namespace Vmm {
/// Abstraction of the VMX and SVM event injection format.
///
/// The 64-bit raw value overlays two views: the coarse `event`/`error`
/// halves, and the decoded `vector`/`type`/`error_valid`/`valid` fields of
/// the low event word.
struct Injection_event
{
  // Combined encoding: event word in bits [31:0], error code in [63:32].
  l4_uint64_t raw = 0;

  // Coarse view: 32-bit event word and 32-bit error code.
  CXX_BITFIELD_MEMBER(0, 31, event, raw);
  CXX_BITFIELD_MEMBER(32, 63, error, raw);

  // SVM and VMX both use the same bit encoding in the lower 11 bits.
  CXX_BITFIELD_MEMBER(0, 7, vector, raw);
  CXX_BITFIELD_MEMBER(8, 10, type, raw);
  CXX_BITFIELD_MEMBER(11, 11, error_valid, raw);

  // SVM and VMX both use bit 31 to indicate validity of the value.
  CXX_BITFIELD_MEMBER(31, 31, valid, raw);

  /// Construct from a pre-encoded event word and error code.
  /// Does not set `valid` beyond what `ev` already encodes.
  Injection_event(l4_uint32_t ev, l4_uint32_t err)
  {
    event() = ev;
    error() = err;
  }

  /// Construct a valid event from its decoded fields.
  ///
  /// \param v          Exception/interrupt vector.
  /// \param t          Event type (hardware-defined 3-bit encoding).
  /// \param err_valid  Whether an error code shall be pushed.
  /// \param err_code   Error code (only meaningful if \a err_valid).
  Injection_event(unsigned char v, unsigned char t, bool err_valid = false,
                  l4_uint32_t err_code = 0)
  {
    vector() = v;
    type() = t;
    error_valid() = err_valid;
    error() = err_code;
    valid() = 1;
  }

  /// Wrap an already-encoded 64-bit value.
  explicit Injection_event(l4_uint64_t val) : raw(val) {}
};
class Event_recorder;

/// Hardware-independent interface to the virtual CPU state, implemented on
/// top of either a VMX VMCS or an SVM VMCB.
class Vm_state
{
public:
  /// Underlying virtualization extension of the implementation.
  enum class Type { Vmx, Svm };

  virtual ~Vm_state() = 0;

  /// \return Whether this state is backed by VMX or SVM.
  virtual Type type() const = 0;

  /// Bring the state into its initial (reset) configuration.
  virtual void init_state() = 0;
  /// Prepare direct entry into 32-bit protected-mode Linux.
  virtual void setup_linux_protected_mode(l4_addr_t entry,
                                          l4_addr_t stack_addr) = 0;
  /// Prepare real-mode entry (BSP reset vector or AP SIPI/resume vector).
  virtual void setup_real_mode(l4_addr_t entry) = 0;

  /// \return Current guest instruction pointer.
  virtual l4_umword_t ip() const = 0;
  /// \return Current guest stack pointer.
  virtual l4_umword_t sp() const = 0;
  /// \return True if the recorded page fault was a write access.
  virtual bool pf_write() const = 0;
  /// \return Guest CR3 (page-table base).
  virtual l4_umword_t cr3() const = 0;
  /// \return Guest XCR0 (enabled extended state components).
  virtual l4_uint64_t xcr0() const = 0;

  /// Emulate RDMSR; returns false if the MSR is not handled here.
  virtual bool read_msr(unsigned msr, l4_uint64_t *value) const = 0;
  /// Emulate WRMSR; may queue a fault on \a ev_rec. Returns false if the
  /// MSR is not handled here.
  virtual bool write_msr(unsigned msr, l4_uint64_t value, Event_recorder *ev_rec) = 0;

  /// \return The event injection that is currently pending, if any.
  virtual Injection_event pending_event_injection() = 0;
  /// Queue \a ev for injection on the next VM entry.
  virtual void inject_event(Injection_event const &ev) = 0;

  /// \return True if an NMI can be injected right now.
  virtual bool can_inject_nmi() const = 0;
  /// \return True if a maskable interrupt can be injected right now.
  virtual bool can_inject_interrupt() const = 0;
  virtual void disable_interrupt_window() = 0;
  virtual void enable_interrupt_window() = 0;
  virtual void disable_nmi_window() = 0;
  virtual void enable_nmi_window() = 0;

  // must only be called once per VM entry
  virtual void advance_entry_ip(unsigned bytes) = 0;
};
} // namespace Vmm

View File

@@ -0,0 +1,699 @@
/*
* Copyright (C) 2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/re/error_helper>
#include "vm_state_svm.h"
#include "consts.h"
#include "mad.h"
namespace Vmm {
/// Bring the VMCB into its initial configuration.
///
/// Enables nested paging, programs reset defaults for PAT/XCR0, clears
/// debug and event-injection state, enables SVM in EFER and sets up the
/// DR/instruction intercepts this VMM relies on. Marks all VMCB clean bits
/// dirty so the hardware reloads everything on the next VMRUN.
void
Svm_state::init_state()
{
  // Does not matter, Linux overwrites it...
  _vmcb->state_save_area.ldtr.selector = 0;
  _vmcb->state_save_area.ldtr.attrib = 0;
  _vmcb->state_save_area.ldtr.limit = 0;
  _vmcb->state_save_area.ldtr.base = 0;

  // TODO: Setup GDTR, IDTR? (not done on VMX)

  // Always use nested paging!
  _vmcb->control_area.np_enable = 1;

  // Initiated to default values at reset: WB,WT,WC,UC,WB,WT,UC-,UC
  _vmcb->state_save_area.g_pat = 0x0007040600010406ULL;
  // Reset value of XCR0
  _vmcb->state_save_area.xcr0 = 1ULL;

  _vmcb->state_save_area.rflags = 0;
  _vmcb->state_save_area.cr3 = 0;
  _vmcb->state_save_area.dr6 = 0;
  _vmcb->state_save_area.dr7 = 0;
  _vmcb->control_area.eventinj = 0;

  // Enable SVM
  _vmcb->state_save_area.efer = Efer_svme_enable;

  // Intercept DR accesses.
  // The kernel enforces 0xff3f, to keep the behavior consistent with VMX, we
  // intercept all DR accesses.
  _vmcb->control_area.intercept_rd_drX = 0xffff;
  _vmcb->control_area.intercept_wr_drX = 0xffff;
  _vmcb->control_area.intercept_exceptions = 0;
  _vmcb->control_area.intercept_instruction0 =
    Intercept_intr | Intercept_nmi | Intercept_smi | Intercept_init
    | Intercept_vintr | Intercept_cr0_sel_write | Intercept_rdpmc
    | Intercept_cpuid | Intercept_invd | Intercept_hlt | Intercept_ioio
    | Intercept_msr | Intercept_task_switch | Intercept_freeze
    | Intercept_shutdown;
  // TODO: These are the instruction intercepts that Fiasco enforces. Check
  // whether we intercept too little or too much...
  _vmcb->control_area.intercept_instruction1 =
    Intercept_vmrun | Intercept_vmmcall | Intercept_vmload
    | Intercept_vmsave | Intercept_stgi | Intercept_clgi | Intercept_skinit
    | Intercept_rdtscp | Intercept_monitor | Intercept_mwait
    | Intercept_xsetbv;

  mark_all_dirty();
}
/// Prepare the guest for direct entry into 32-bit protected-mode Linux.
///
/// Installs flat 4 GiB code/data segments as required by the Linux 32-bit
/// boot protocol and sets the entry point, initial stack, CR0 and CR4.
///
/// \param entry       Guest address of the 32-bit kernel entry point.
/// \param stack_addr  Initial guest stack pointer.
void
Svm_state::setup_linux_protected_mode(l4_addr_t entry, l4_addr_t stack_addr)
{
  _vmcb->state_save_area.cs.selector = 0x10;
  _vmcb->state_save_area.cs.attrib = 0xc9a; // TYPE=10=Read/Execute, S, P, DB, G
  _vmcb->state_save_area.cs.limit = 0xffffffff;
  _vmcb->state_save_area.cs.base = 0;

  _vmcb->state_save_area.ss.selector = 0x18;
  _vmcb->state_save_area.ss.attrib = 0xc92; // TYPE=2=Read/Write, S, P, DB, G
  _vmcb->state_save_area.ss.limit = 0xffffffff;
  _vmcb->state_save_area.ss.base = 0;

  // DS/ES mirror SS: flat writable data segments.
  _vmcb->state_save_area.ds.selector = 0x18;
  _vmcb->state_save_area.ds.attrib = 0xc92;
  _vmcb->state_save_area.ds.limit = 0xffffffff;
  _vmcb->state_save_area.ds.base = 0;

  _vmcb->state_save_area.es.selector = 0x18;
  _vmcb->state_save_area.es.attrib = 0xc92;
  _vmcb->state_save_area.es.limit = 0xffffffff;
  _vmcb->state_save_area.es.base = 0;

  _vmcb->state_save_area.fs.selector = 0x0;
  _vmcb->state_save_area.fs.attrib = 0xcf3; // Equivalent to VMX
  _vmcb->state_save_area.fs.limit = 0xffffffff;
  _vmcb->state_save_area.fs.base = 0;

  _vmcb->state_save_area.gs.selector = 0x0;
  _vmcb->state_save_area.gs.attrib = 0xcf3;
  _vmcb->state_save_area.gs.limit = 0xffffffff;
  _vmcb->state_save_area.gs.base = 0;

  _vmcb->state_save_area.tr.selector = 0x28;
  _vmcb->state_save_area.tr.attrib = 0x8b; // TYPE=11, P
  _vmcb->state_save_area.tr.limit = 0x67; // TODO: VMX uses 67 here
  _vmcb->state_save_area.tr.base = 0;

  _vmcb->state_save_area.rip = entry;
  _vmcb->state_save_area.rsp = stack_addr;

  // 0x10031: PE | ET | NE | WP -- protected mode, paging off.
  _vmcb->state_save_area.cr0 = 0x10031;
  // 0x690: PSE | PGE | OSFXSR | OSXMMEXCPT.
  _vmcb->state_save_area.cr4 = 0x690;
}
/**
 * Setup the Real Mode startup procedure for AP startup and BSP resume.
 *
 * This follows the hardware reset behavior described in AMD APM "14.1.5
 * Fetching the first instruction".
 *
 * \param entry  Guest-physical entry point. The special value 0xfffffff0
 *               selects the BSP reset vector; any other value is treated
 *               as an AP SIPI vector or an ACPI waking vector.
 */
void
Svm_state::setup_real_mode(l4_addr_t entry)
{
  if (entry == 0xfffffff0U)
    {
      // Bootstrap Processor (BSP) boot
      _vmcb->state_save_area.cs.selector = 0xf000U;
      _vmcb->state_save_area.cs.base = 0xffff0000U;
      _vmcb->state_save_area.rip = 0xfff0U;
    }
  else
    {
      // Application Processor (AP) boot via Startup IPI (SIPI) or resume
      // from suspend.
      // cs_base contains the cached address computed from cs_selector. After
      // reset cs_base contains what we set until the first cs selector is
      // loaded. We use the waking vector or SIPI vector directly, because
      // tianocore cannot handle the CS_BASE + IP split.
      _vmcb->state_save_area.cs.selector = entry >> 4;
      _vmcb->state_save_area.cs.base = entry;
      _vmcb->state_save_area.rip = 0;
    }

  _vmcb->state_save_area.cs.attrib = 0x9b; // TYPE=11, S, P
  _vmcb->state_save_area.cs.limit = 0xffff;

  _vmcb->state_save_area.ss.selector = 0x18;
  _vmcb->state_save_area.ss.attrib = 0x93; // TYPE=3, S, P
  _vmcb->state_save_area.ss.limit = 0xffff;
  _vmcb->state_save_area.ss.base = 0;

  // DS/ES mirror SS: 64 KiB writable real-mode data segments.
  _vmcb->state_save_area.ds.selector = 0x18;
  _vmcb->state_save_area.ds.attrib = 0x93;
  _vmcb->state_save_area.ds.limit = 0xffff;
  _vmcb->state_save_area.ds.base = 0;

  _vmcb->state_save_area.es.selector = 0x18;
  _vmcb->state_save_area.es.attrib = 0x93;
  _vmcb->state_save_area.es.limit = 0xffff;
  _vmcb->state_save_area.es.base = 0;

  _vmcb->state_save_area.fs.selector = 0x0;
  _vmcb->state_save_area.fs.attrib = 0x93;
  _vmcb->state_save_area.fs.limit = 0xffff;
  _vmcb->state_save_area.fs.base = 0;

  _vmcb->state_save_area.gs.selector = 0x0;
  _vmcb->state_save_area.gs.attrib = 0x93;
  _vmcb->state_save_area.gs.limit = 0xffff;
  _vmcb->state_save_area.gs.base = 0;

  _vmcb->state_save_area.tr.selector = 0x0;
  _vmcb->state_save_area.tr.attrib = 0x8b; // TYPE=11, P
  _vmcb->state_save_area.tr.limit = 0xffff;
  _vmcb->state_save_area.tr.base = 0;

  _vmcb->state_save_area.rsp = 0;
  // 0x10030: ET | NE | WP; PE clear => real mode.
  _vmcb->state_save_area.cr0 = 0x10030;
  // 0x680: PGE | OSFXSR | OSXMMEXCPT.
  _vmcb->state_save_area.cr4 = 0x680;

  // clear in SW state to prevent injection of pending events from before
  // INIT/STARTUP IPI.
  _vmcb->control_area.exitintinfo = 0ULL;
}
/// Compute n_rip by decoding the current instruction, for cases where the
/// hardware did not provide a next-instruction pointer.
///
/// \param regs          vCPU general-purpose registers.
/// \param inst_buf      Bytes of the instruction at the current IP.
/// \param inst_buf_len  Number of valid bytes in \a inst_buf.
///
/// \retval true   n_rip in the VMCB was updated.
/// \retval false  The instruction could not be decoded.
bool
Svm_state::determine_next_ip_from_ip(l4_vcpu_regs_t *regs,
                                     unsigned char *inst_buf,
                                     unsigned inst_buf_len)
{
  L4mad::Op insn;
  L4mad::Desc dst_desc, src_desc;
  L4mad::Decoder dec(reinterpret_cast<l4_exc_regs_t *>(regs), ip(), inst_buf,
                     inst_buf_len);

  if (dec.decode(&insn, &dst_desc, &src_desc)
      == L4mad::Decoder::Result::Success)
    {
      trace().printf("Advance instruction pointer n_rip = 0x%lx + 0x%x\n",
                     ip(), insn.insn_len);
      _vmcb->control_area.n_rip = ip() + insn.insn_len;
      return true;
    }

  warn().printf("Could not decode instruction for current ip\n");
  return false;
}
/// Emulate RDMSR for MSRs handled directly by the SVM state.
///
/// \param      msr    MSR number read by the guest.
/// \param[out] value  Value reported to the guest.
///
/// \retval true   MSR handled, \a value is valid.
/// \retval false  MSR unknown here; caller must handle it elsewhere.
bool
Svm_state::read_msr(unsigned msr, l4_uint64_t *value) const
{
  switch (msr)
    {
    case 0x3a: // IA32_FEATURE_CONTROL
      // Report the register as locked so the guest does not try to enable
      // anything.
      *value = 1U;
      return true;

    case 0x8b:       // IA32_BIOS_SIGN_ID
    case 0x1a0:      // IA32_MISC_ENABLE
    // TODO: Report errata to the guest? Allow direct read access to OSVW
    // register in msrpm in Fiasco?
    case 0xc0010140: // OSVW_ID_Length
    case 0xc001001f: // MSR_AMD64_NB_CFG -- can be safely reported as zero
      *value = 0U;
      return true;

    case 0x277: // PAT
      *value = _vmcb->state_save_area.g_pat;
      return true;

    case 0xc0000080: // EFER -- hide the SVME bit from the guest
      *value = _vmcb->state_save_area.efer & ~Efer_svme_enable;
      return true;

    default:
      return false;
    }
}
bool
Svm_state::write_msr(unsigned msr, l4_uint64_t value, Event_recorder *ev_rec)
{
switch (msr)
{
case 0x277: // PAT
// sanitization of 7 PAT values
// 0xF8 are reserved bits
// 0x2 and 0x3 are reserved encodings
// usage of reserved bits and encodings results in a #GP
if (value & 0xF8F8F8F8F8F8F8F8ULL)
{
ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 13, 0);
break;
}
for (unsigned i = 0; i < 7; ++i)
{
l4_uint64_t const PAi_mask = (value & (0x7ULL << i * 8)) >> i * 8;
if ((PAi_mask == 0x2ULL) || (PAi_mask == 0x3ULL))
{
ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 13, 0);
break;
}
}
_vmcb->state_save_area.g_pat = value;
break;
case 0xc0000080: // efer
{
// Force the SVME bit
l4_uint64_t efer = (value & Efer_guest_write_mask) | Efer_svme_enable;
l4_uint64_t old_efer = _vmcb->state_save_area.efer;
l4_uint64_t cr0 = _vmcb->state_save_area.cr0;
trace().printf("cr0: 0x%llx old efer 0x%llx new efer 0x%llx\n",
cr0, old_efer, efer);
// There is no going back from enabling long mode.
efer |= old_efer & Efer_lme;
if ((efer & Efer_lme) && (cr0 & Cr0_pg))
{
// indicate that long mode is active
efer |= Efer_lma;
}
trace().printf("efer: 0x%llx\n", efer);
_vmcb->state_save_area.efer = efer;
mark_dirty(Vmcb_crx);
break;
}
case 0xc001001f: // MSR_AMD64_NB_CFG
// can all be savely ignored
break;
default:
return false;
}
return true;
}
/// Handle an intercepted MOV-to-CR0.
///
/// Uses the decode-assist information in EXITINFO1 to find the source GPR,
/// writes CR0 (with ET forced) and keeps EFER.LMA consistent when paging is
/// switched on or off.
///
/// \param regs  vCPU general-purpose registers.
///
/// \retval Jump_instr  CR0 updated; skip the instruction.
/// \retval -1          No decode assist available (LMSW), unsupported.
int
Svm_state::handle_cr0_write(l4_vcpu_regs_t *regs)
{
  l4_uint64_t const decode = exit_info1();
  if (!(decode & Cr_valid))
    {
      // No decode assist information was provided for the access:
      // "If the instruction is LMSW no additional information is provided."
      Err().printf("LMSW write to CR0 not supported.\n");
      return -1;
    }

  l4_umword_t const cr0_new = read_gpr(regs, decode & Cr_gpr_mask);
  l4_uint64_t const cr0_old = _vmcb->state_save_area.cr0;
  trace().printf("Write to cr0: 0x%llx -> 0x%lx\n", cr0_old, cr0_new);

  // 0x10 => Extension Type; hardcoded to 1 see manual
  _vmcb->state_save_area.cr0 = cr0_new | 0x10;
  mark_dirty(Vmcb_crx);

  bool const pg_was_on = cr0_old & Cr0_pg;
  bool const pg_is_on = cr0_new & Cr0_pg;

  if (pg_is_on && !pg_was_on
      && (_vmcb->state_save_area.efer & Efer_lme))
    {
      // indicate that long mode is active
      info().printf("Enable long mode\n");
      _vmcb->state_save_area.efer |= Efer_lma;
    }
  else if (!pg_is_on && pg_was_on)
    {
      trace().printf("Disabling paging ...\n");
      if (_vmcb->state_save_area.efer & Efer_lme)
        _vmcb->state_save_area.efer &= ~Efer_lma;
    }

  return Jump_instr;
}
/// Handle an intercepted XSETBV instruction.
///
/// Only XCR0 (ECX == 0) is supported; the 64-bit value arrives split
/// across EDX:EAX. Writes from CPL != 0 are ignored.
///
/// \param regs  vCPU general-purpose registers.
///
/// \retval Jump_instr  Write applied (or deliberately ignored); skip insn.
/// \retval -L4_EINVAL  Unknown extended control register.
int
Svm_state::handle_xsetbv(l4_vcpu_regs_t *regs)
{
  // TODO: We have to check that the current privilege level is 0, and inject
  // a general protection exception into the guest otherwise!
  if (_vmcb->state_save_area.cpl != 0)
    {
      warn().printf(
        "Ignoring write to extended control register %ld from CPL %d.\n",
        regs->cx, _vmcb->state_save_area.cpl);
      return Jump_instr;
    }

  if (regs->cx != 0)
    {
      info().printf("Writing unknown extended control register %ld\n",
                    regs->cx);
      return -L4_EINVAL;
    }

  l4_uint64_t const xcr0 = (l4_uint64_t(regs->dx) << 32)
                           | (l4_uint64_t(regs->ax) & 0xFFFFFFFF);
  _vmcb->state_save_area.xcr0 = xcr0;
  trace().printf("Setting xcr0 to 0x%llx\n", xcr0);
  return Jump_instr;
}
/// Handle an intercepted hardware exception.
///
/// #DB (vector 1) and #AC (vector 17) are queued for injection into the
/// guest without touching the IP; for all other vectors the exception is
/// only logged and an error is returned to the caller.
///
/// \param ev_rec  Event recorder used to queue the injected exception.
/// \param num     Exception vector number.
///
/// \retval Retry       #DB/#AC queued for injection; re-enter the guest.
/// \retval -L4_EINVAL  Vector not handled here.
int
Svm_state::handle_hardware_exception(Event_recorder *ev_rec, unsigned num)
{
  Err err;

  // Besides #DB and #AC all hardware exceptions are reflected to the guest.
  // The print statements serve as (paranoid) debug help in case the reflection
  // does not happen.
  switch (num)
    {
    case 0: err.printf("Hardware exception: Divide error\n"); break;
    case 1: // #DB
      {
        ev_rec->make_add_event<Event_exc>(Event_prio::Exception, num);
        // #DB exceptions are either of fault type or of trap type. We reflect
        // both to the guest, without changing state, thus don't change the IP.
        return Retry;
      }
    case 3: err.printf("Hardware exception: Breakpoint\n"); break;
    case 4: err.printf("Hardware exception: Overflow\n"); break;
    case 5: err.printf("Hardware exception: Bound range\n"); break;
    case 6: err.printf("Hardware exception: Invalid opcode\n"); break;
    case 7: err.printf("Hardware exception: Device not available\n"); break;
    case 8: err.printf("Hardware exception: Double fault\n"); break;
    case 10: err.printf("Hardware exception: Invalid TSS\n"); break;
    case 11: err.printf("Hardware exception: Segment not present\n"); break;
    case 12: err.printf("Hardware exception: Stack-segment fault\n"); break;
    case 13: err.printf("Hardware exception: General protection\n"); break;
    case 14: err.printf("Hardware exception: Page fault\n"); break;
    case 16: err.printf("Hardware exception: FPU error\n"); break;
    case 17: // #AC
      {
        // #AC carries an error code in EXITINFO1; forward it unchanged.
        l4_uint64_t err_code = exit_info1();
        ev_rec->make_add_event<Event_exc>(Event_prio::Exception, num, err_code);
        return Retry;
      }
    case 18: err.printf("Hardware exception: Machine check\n"); break;
    case 19: err.printf("Hardware exception: SIMD error\n"); break;
    default: err.printf("Hardware exception: Unknown exception\n"); break;
    }

  return -L4_EINVAL;
}
/// Read a guest general-purpose register by its SVM decode-assist number.
///
/// \param regs  vCPU general-purpose registers.
/// \param reg   Register number 0..15 (AMD decode-assist encoding).
///
/// \return The register value.
/// \throws L4::Runtime_error(-L4_EINVAL) for register numbers > 15.
l4_umword_t
Svm_state::read_gpr(l4_vcpu_regs_t *regs, unsigned reg) const
{
  // RSP (number 4) is not part of the vCPU register block; it lives in the
  // VMCB state save area.
  if (reg == 4)
    return _vmcb->state_save_area.rsp;

  switch (reg)
    {
    case 0: return regs->ax;
    case 1: return regs->cx;
    case 2: return regs->dx;
    case 3: return regs->bx;
    case 5: return regs->bp;
    case 6: return regs->si;
    case 7: return regs->di;
    case 8: return regs->r8;
    case 9: return regs->r9;
    case 10: return regs->r10;
    case 11: return regs->r11;
    case 12: return regs->r12;
    case 13: return regs->r13;
    case 14: return regs->r14;
    case 15: return regs->r15;
    default: L4Re::throw_error(-L4_EINVAL, "Invalid register num.");
    }
}
/// Translate an SVM #VMEXIT code into a human-readable description.
///
/// Range-based codes (CR/DR accesses, exception vectors) are folded to one
/// description per range; individual codes are looked up in a table.
///
/// \param exit  Exit code taken from the VMCB.
/// \return Static description string, or nullptr for unknown codes.
const char *
Svm_state::str_exit_code(Exit exit)
{
  l4_uint32_t code = static_cast<l4_uint32_t>(exit);

  if (/* code >= 0x00 && */ code <= 0x0f)
    return "Read of CR 0-15";
  if (code >= 0x10 && code <= 0x1f)
    return "Write of CR 0-15";
  if (code >= 0x20 && code <= 0x2f)
    return "Read of DR 0-15";
  if (code >= 0x30 && code <= 0x3f)
    return "Write of DR 0-15";
  if (code >= 0x40 && code <= 0x5f)
    return "Exception vector 0-31";
  if (code >= 0x90 && code <= 0x9f)
    return "Write of CR 0-15 (trap)";

  switch (code)
    {
    case 0x60: return "Physical INTR (maskable interrupt)";
    case 0x61: return "Physical NMI";
    case 0x62: return "Physical SMI";
    case 0x63: return "Physical INIT";
    case 0x64: return "Virtual INTR";
    case 0x65: return "Write of CR0 that changed any bits other than CR0.TS or CR0.MP";
    case 0x66: return "Read of IDTR";
    case 0x67: return "Read of GDTR";
    case 0x68: return "Read of LDTR";
    case 0x69: return "Read of TR";
    case 0x6A: return "Write of IDTR";
    case 0x6B: return "Write of GDTR";
    case 0x6C: return "Write of LDTR";
    case 0x6D: return "Write of TR";
    case 0x6E: return "RDTSC instruction";
    case 0x6F: return "RDPMC instruction";
    case 0x70: return "PUSHF instruction";
    case 0x71: return "POPF instruction";
    case 0x72: return "CPUID instruction";
    case 0x73: return "RSM instruction";
    case 0x74: return "IRET instruction";
    case 0x75: return "Software interrupt (INTn instructions)";
    case 0x76: return "INVD instruction";
    case 0x77: return "PAUSE instruction";
    case 0x78: return "HLT instruction";
    case 0x79: return "INVLPG instructions";
    case 0x7A: return "INVLPGA instruction";
    case 0x7B: return "IN or OUT accessing protected port";
    case 0x7C: return "RDMSR or WRMSR access to protected MSR";
    case 0x7D: return "Task switch";
    case 0x7E: return "FP error freeze";
    case 0x7F: return "Shutdown";
    case 0x80: return "VMRUN instruction";
    case 0x81: return "VMMCALL instruction";
    case 0x82: return "VMLOAD instruction";
    case 0x83: return "VMSAVE instruction";
    case 0x84: return "STGI instruction";
    case 0x85: return "CLGI instruction";
    case 0x86: return "SKINIT instruction";
    case 0x87: return "RDTSCP instruction";
    case 0x88: return "ICEBP instruction";
    case 0x89: return "WBINVD or WBNOINVD instruction";
    case 0x8A: return "MONITOR or MONITORX instruction";
    case 0x8B: return "MWAIT or MWAITX instruction";
    case 0x8C: return "MWAIT or MWAITX instruction, if monitor hardware is armed.";
    // Note: 0x8E/0x8D listed out of numeric order; harmless in a switch.
    case 0x8E: return "RDPRU instruction";
    case 0x8D: return "XSETBV instruction";
    case 0x8F: return "Write of EFER MSR";
    case 0xA3: return "MCOMMIT instruction";
    case 0x400: return "Nested paging host-level page fault";
    case 0x401: return "AVIC Virtual IPI delivery not completed";
    case 0x402: return "AVIC Access to unaccelerated vAPIC register";
    case 0x403: return "VMGEXIT instruction";
    case -1U: return "Invalid guest state in VMCB";
    default: return nullptr;
    }
}
/// Dump the vCPU registers, the VMCB control area and the VMCB state save
/// area to the warn log for post-mortem debugging.
///
/// \param regs  vCPU general-purpose registers to print alongside the VMCB.
void
Svm_state::dump(l4_vcpu_regs_t const *regs) const
{
  warn().printf("Registers:\n");
  warn().printf("r15=0x%lx\n", regs->r15); /**< r15 register */
  warn().printf("r14=0x%lx\n", regs->r14); /**< r14 register */
  warn().printf("r13=0x%lx\n", regs->r13); /**< r13 register */
  warn().printf("r12=0x%lx\n", regs->r12); /**< r12 register */
  warn().printf("r11=0x%lx\n", regs->r11); /**< r11 register */
  warn().printf("r10=0x%lx\n", regs->r10); /**< r10 register */
  warn().printf("r9=0x%lx\n", regs->r9); /**< r9 register */
  warn().printf("r8=0x%lx\n", regs->r8); /**< r8 register */
  warn().printf("di=0x%lx\n", regs->di); /**< rdi register */
  warn().printf("si=0x%lx\n", regs->si); /**< rsi register */
  warn().printf("bp=0x%lx\n", regs->bp); /**< rbp register */
  warn().printf("pfa=0x%lx\n", regs->pfa); /**< page fault address */
  warn().printf("bx=0x%lx\n", regs->bx); /**< rbx register */
  warn().printf("dx=0x%lx\n", regs->dx); /**< rdx register */
  warn().printf("cx=0x%lx\n", regs->cx); /**< rcx register */
  warn().printf("ax=0x%lx\n", regs->ax); /**< rax register */
  warn().printf("trapno=0x%lx\n", regs->trapno); /**< trap number */
  warn().printf("err=0x%lx\n", regs->err); /**< error code */
  warn().printf("ip=0x%lx\n", regs->ip); /**< instruction pointer */
  warn().printf("cs=0x%lx\n", regs->cs); /**< dummy \internal */
  warn().printf("flags=0x%lx\n", regs->flags); /**< eflags */
  warn().printf("sp=0x%lx\n", regs->sp); /**< stack pointer */
  warn().printf("ss=0x%lx\n", regs->ss);
  warn().printf("fs_base=0x%lx\n", regs->fs_base);
  warn().printf("gs_base=0x%lx\n", regs->gs_base);
  warn().printf("ds=0x%x\n", regs->ds);
  warn().printf("es=0x%x\n", regs->es);
  warn().printf("fs=0x%x\n", regs->fs);
  warn().printf("gs=0x%x\n", regs->gs);

  warn().printf("Control area:\n");
  warn().printf("intercept_rd_crX=0x%x\n", _vmcb->control_area.intercept_rd_crX);
  warn().printf("intercept_wr_crX=0x%x\n", _vmcb->control_area.intercept_wr_crX);
  warn().printf("intercept_rd_drX=0x%x\n", _vmcb->control_area.intercept_rd_drX);
  warn().printf("intercept_wr_drX=0x%x\n", _vmcb->control_area.intercept_wr_drX);
  warn().printf("intercept_exceptions=0x%x\n", _vmcb->control_area.intercept_exceptions);
  warn().printf("intercept_instruction0=0x%x\n", _vmcb->control_area.intercept_instruction0);
  warn().printf("intercept_instruction1=0x%x\n", _vmcb->control_area.intercept_instruction1);
  warn().printf("pause_filter_threshold=0x%x\n", _vmcb->control_area.pause_filter_threshold);
  warn().printf("pause_filter_count=0x%x\n", _vmcb->control_area.pause_filter_count);
  warn().printf("iopm_base_pa=0x%llx\n", _vmcb->control_area.iopm_base_pa);
  warn().printf("msrpm_base_pa=0x%llx\n", _vmcb->control_area.msrpm_base_pa);
  warn().printf("tsc_offset=0x%llx\n", _vmcb->control_area.tsc_offset);
  warn().printf("guest_asid_tlb_ctl=0x%llx\n", _vmcb->control_area.guest_asid_tlb_ctl);
  warn().printf("interrupt_ctl=0x%llx\n", _vmcb->control_area.interrupt_ctl);
  warn().printf("interrupt_shadow=0x%llx\n", _vmcb->control_area.interrupt_shadow);
  warn().printf("exitcode=0x%llx\n", _vmcb->control_area.exitcode);
  warn().printf("exitinfo1=0x%llx\n", _vmcb->control_area.exitinfo1);
  warn().printf("exitinfo2=0x%llx\n", _vmcb->control_area.exitinfo2);
  warn().printf("exitintinfo=0x%llx\n", _vmcb->control_area.exitintinfo);
  warn().printf("np_enable=0x%llx\n", _vmcb->control_area.np_enable);
  warn().printf("eventinj=0x%llx\n", _vmcb->control_area.eventinj);
  warn().printf("n_cr3=0x%llx\n", _vmcb->control_area.n_cr3);
  warn().printf("lbr_virtualization_enable=0x%llx\n", _vmcb->control_area.lbr_virtualization_enable);
  warn().printf("clean_bits=0x%llx\n", _vmcb->control_area.clean_bits);
  warn().printf("n_rip=0x%llx\n", _vmcb->control_area.n_rip);

  warn().printf("State save area:\n");
  // NOTE(review): the segment format strings below end in an unmatched ')'
  // with no opening '(' -- cosmetic defect in the log output.
  warn().printf("es: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.es.selector,
                _vmcb->state_save_area.es.attrib,
                _vmcb->state_save_area.es.limit,
                _vmcb->state_save_area.es.base);
  warn().printf("cs: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.cs.selector,
                _vmcb->state_save_area.cs.attrib,
                _vmcb->state_save_area.cs.limit,
                _vmcb->state_save_area.cs.base);
  warn().printf("ss: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.ss.selector,
                _vmcb->state_save_area.ss.attrib,
                _vmcb->state_save_area.ss.limit,
                _vmcb->state_save_area.ss.base);
  warn().printf("ds: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.ds.selector,
                _vmcb->state_save_area.ds.attrib,
                _vmcb->state_save_area.ds.limit,
                _vmcb->state_save_area.ds.base);
  warn().printf("fs: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.fs.selector,
                _vmcb->state_save_area.fs.attrib,
                _vmcb->state_save_area.fs.limit,
                _vmcb->state_save_area.fs.base);
  warn().printf("gs: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.gs.selector,
                _vmcb->state_save_area.gs.attrib,
                _vmcb->state_save_area.gs.limit,
                _vmcb->state_save_area.gs.base);
  warn().printf("gdtr: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.gdtr.selector,
                _vmcb->state_save_area.gdtr.attrib,
                _vmcb->state_save_area.gdtr.limit,
                _vmcb->state_save_area.gdtr.base);
  warn().printf("ldtr: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.ldtr.selector,
                _vmcb->state_save_area.ldtr.attrib,
                _vmcb->state_save_area.ldtr.limit,
                _vmcb->state_save_area.ldtr.base);
  warn().printf("idtr: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.idtr.selector,
                _vmcb->state_save_area.idtr.attrib,
                _vmcb->state_save_area.idtr.limit,
                _vmcb->state_save_area.idtr.base);
  warn().printf("tr: selector=0x%x, attrib=0x%x, limit=0x%x, base=0x%llx)\n",
                _vmcb->state_save_area.tr.selector,
                _vmcb->state_save_area.tr.attrib,
                _vmcb->state_save_area.tr.limit,
                _vmcb->state_save_area.tr.base);
  warn().printf("cpl=0x%x\n", _vmcb->state_save_area.cpl);
  warn().printf("efer=0x%llx\n", _vmcb->state_save_area.efer);
  warn().printf("cr4=0x%llx\n", _vmcb->state_save_area.cr4);
  warn().printf("cr3=0x%llx\n", _vmcb->state_save_area.cr3);
  warn().printf("cr0=0x%llx\n", _vmcb->state_save_area.cr0);
  warn().printf("dr7=0x%llx\n", _vmcb->state_save_area.dr7);
  warn().printf("dr6=0x%llx\n", _vmcb->state_save_area.dr6);
  warn().printf("rflags=0x%llx\n", _vmcb->state_save_area.rflags);
  warn().printf("rip=0x%llx\n", _vmcb->state_save_area.rip);
  warn().printf("rsp=0x%llx\n", _vmcb->state_save_area.rsp);
  warn().printf("rax=0x%llx\n", _vmcb->state_save_area.rax);
  warn().printf("star=0x%llx\n", _vmcb->state_save_area.star);
  warn().printf("lstar=0x%llx\n", _vmcb->state_save_area.lstar);
  warn().printf("cstar=0x%llx\n", _vmcb->state_save_area.cstar);
  warn().printf("sfmask=0x%llx\n", _vmcb->state_save_area.sfmask);
  warn().printf("kernelgsbase=0x%llx\n", _vmcb->state_save_area.kernelgsbase);
  warn().printf("sysenter_cs=0x%llx\n", _vmcb->state_save_area.sysenter_cs);
  warn().printf("sysenter_esp=0x%llx\n", _vmcb->state_save_area.sysenter_esp);
  warn().printf("sysenter_eip=0x%llx\n", _vmcb->state_save_area.sysenter_eip);
  warn().printf("cr2=0x%llx\n", _vmcb->state_save_area.cr2);
  warn().printf("g_pat=0x%llx\n", _vmcb->state_save_area.g_pat);
  warn().printf("dbgctl=0x%llx\n", _vmcb->state_save_area.dbgctl);
  warn().printf("br_from=0x%llx\n", _vmcb->state_save_area.br_from);
  warn().printf("br_to=0x%llx\n", _vmcb->state_save_area.br_to);
  warn().printf("lastexcpfrom=0x%llx\n", _vmcb->state_save_area.lastexcpfrom);
  warn().printf("last_excpto=0x%llx\n", _vmcb->state_save_area.last_excpto);
  // this field is _NOT_ part of the official VMCB specification
  // a (userlevel) VMM needs this for proper FPU state virtualization
  warn().printf("xcr0=0x%llx\n", _vmcb->state_save_area.xcr0);
}
} //namespace Vmm

View File

@@ -0,0 +1,469 @@
/*
* Copyright (C) 2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/vm>
#include <l4/cxx/bitfield>
#include "vm_state.h"
#include "debug.h"
#include "pt_walker.h"
#include "event_recorder.h"
namespace Vmm {
class Svm_state : public Vm_state
{
public:
enum Cpuid_svm
{
/// Indicates support for NEXT_RIP save on #VMEXIT.
Cpuid_svm_feature_nrips = 1UL << 3,
/// Indicates support for the decode assists.
Cpuid_svm_feature_decode_assists = 1UL << 7,
};
enum class Exit
{
Cr0_read = 0x00, // Cr_access
Cr15_read = 0x0f, // Cr_access
Cr0_write = 0x10, // Cr_access
Cr15_write = 0x1f, // Cr_access
Dr0_read = 0x20, // DR access
Dr1_read,
Dr2_read,
Dr3_read,
Dr4_read,
Dr5_read,
Dr6_read,
Dr7_read,
Dr8_read,
Dr9_read,
Dr10_read,
Dr11_read,
Dr12_read,
Dr13_read,
Dr14_read,
Dr15_read = 0x2f, // DR access
Dr0_write = 0x30, // DR access
Dr1_write,
Dr2_write,
Dr3_write,
Dr4_write,
Dr5_write,
Dr6_write,
Dr7_write,
Dr8_write,
Dr9_write,
Dr10_write,
Dr11_write,
Dr12_write,
Dr13_write,
Dr14_write,
Dr15_write = 0x3f, // DR access
Excp_0 = 0x40, // Exception_or_nmi
Excp_31 = 0x5f, // Exception_or_nmi
Intr = 0x60, // ??? Physical interrupt (maskable) -> directly handled by Fiasco!
Nmi = 0x61, // ??? Exception_or_nmi
Vintr = 0x64, // ??? Virtual interrupt
Cr0_sel_write = 0x65, // Cr_access
Rdpmc = 0x6f, // RDPMC instruction
Cpuid = 0x72, // Cpuid
Sw_int = 0x75, // INTn instruction
Hlt = 0x78, // Exec_halt
Ioio = 0x7b, // Io_access
Msr = 0x7c, // Exec_rdmsr and Exec_wrmsr
Shutdown = 0x7f, // Shutdown event
Vmrun = 0x80, // VMRUN instruction
Vmmcall = 0x81, // Exec_vmcall
Vmload = 0x82, // VMLOAD instruction
Vmsave = 0x83, // VMSAVE instruction
Stgi = 0x84, // STGI instruction
Clgi = 0x85, // CLGI instruction
Skinit = 0x86, // SKINIT instruction
Rdtscp = 0x87, // RDTSCP instruction
Icebp = 0x88, // INT1 instruction
Xsetbv = 0x8d, // Exec_xsetbv, write to xcr0 field in guest_state
Cr0_write_trap = 0x90, // Cr_access
Cr15_write_trap = 0x9f, // Cr_access
Nested_page_fault = 0x400, // Ept_violation
// TODO: intercept FERR_FREEZE event
// TODO: intercept INTR/NMI/SMI/INIT
// TODO: intercept INVD
// TODO: intercept task switch
// TODO: intercept iopm and msrpm
// TODO: intercept MONITOR/MWAIT
// TODO: intercept #AC and #DB (expections)
};
enum Intercept_inst0
{
Intercept_intr = 1 << 0,
Intercept_nmi = 1 << 1,
Intercept_smi = 1 << 2,
Intercept_init = 1 << 3,
Intercept_vintr = 1 << 4,
Intercept_cr0_sel_write = 1 << 5,
Intercept_rdpmc = 1 << 15,
Intercept_cpuid = 1 << 18,
Intercept_invd = 1 << 22,
Intercept_hlt = 1 << 24,
Intercept_ioio = 1 << 27,
Intercept_msr = 1 << 28,
Intercept_task_switch = 1 << 29,
Intercept_freeze = 1 << 30,
Intercept_shutdown = 1 << 31,
};
enum Intercept_inst1
{
Intercept_vmrun = 1 << 0,
Intercept_vmmcall = 1 << 1,
Intercept_vmload = 1 << 2,
Intercept_vmsave = 1 << 3,
Intercept_stgi = 1 << 4,
Intercept_clgi = 1 << 5,
Intercept_skinit = 1 << 6,
Intercept_rdtscp = 1 << 7,
Intercept_icebp = 1 << 8,
Intercept_wbinvd_wbnoinvd = 1 << 9,
Intercept_monitor = 1 << 10,
Intercept_mwait = 1 << 11,
Intercept_mwait_mon = 1 << 12,
Intercept_xsetbv = 1 << 13,
Intercept_rdpru = 1 << 14,
Intercept_efer_write = 1 << 15,
Intercept_cr0_cr15_write = 0xffff << 16,
};
// EFER bits relevant for the guest (AMD APM Vol. 2).
enum Efer
{
  Efer_lme = 1 << 8,           // Long Mode Enable
  Efer_lma = 1 << 10,          // Long Mode Active (read-only for the guest)
  Efer_svme_enable = 1 << 12,  // Secure Virtual Machine Enable
  // TODO: Efer has some additional bits above on AMD
  // Bits the guest is allowed to modify (SCE, LME, NXE).
  Efer_guest_write_mask = 0xd01,
};
// CR0 bits used by the SVM exit handling.
enum Cr0 : unsigned long
{
  Cr0_pe = 1UL << 0,   // Protected mode Enable
  Cr0_pg = 1UL << 31,  // PaGing enable
};
// Decode-assist information delivered in EXITINFO1 for CR accesses.
enum Decode_assist : unsigned long long
{
  Cr_gpr_mask = 0xf,       // GPR number involved in the CR access
  Cr_valid = 1ULL << 63,   // decode-assist information is valid
};
// RFLAGS bits used by the SVM exit handling.
enum Flags : unsigned long
{
  Interrupt_enabled = (1UL << 9),   // RFLAGS.IF
  Virtual_8086_mode = (1UL << 17),  // RFLAGS.VM
};
// Decoded EXITINFO1 of an IOIO intercept (port I/O exit).
struct Io_info
{
  l4_uint32_t raw;
  explicit Io_info(l4_uint32_t val) : raw(val) {}
  CXX_BITFIELD_MEMBER( 0, 0, type, raw);       // 1 = IN, 0 = OUT
  CXX_BITFIELD_MEMBER( 2, 2, str, raw);        // string instruction (INS/OUTS)
  CXX_BITFIELD_MEMBER( 3, 3, rep, raw);        // REP prefix present
  CXX_BITFIELD_MEMBER( 4, 4, sz8, raw);        // 8-bit operand
  CXX_BITFIELD_MEMBER( 5, 5, sz16, raw);       // 16-bit operand
  CXX_BITFIELD_MEMBER( 6, 6, sz32, raw);       // 32-bit operand
  CXX_BITFIELD_MEMBER( 7, 7, a16, raw);        // 16-bit address size
  CXX_BITFIELD_MEMBER( 8, 8, a32, raw);        // 32-bit address size
  CXX_BITFIELD_MEMBER( 9, 9, a64, raw);        // 64-bit address size
  CXX_BITFIELD_MEMBER( 4, 6, data_size, raw);  // operand-size bits as a group
  CXX_BITFIELD_MEMBER( 7, 9, addr_size, raw);  // address-size bits as a group
  CXX_BITFIELD_MEMBER(10, 12, seg, raw);       // effective segment number
  CXX_BITFIELD_MEMBER(16, 31, port, raw);      // intercepted I/O port
};
// Decoded EXITINFO1 of a nested page fault (NPF) exit.
struct Npf_info
{
  l4_uint64_t raw;
  explicit Npf_info(l4_uint64_t val) : raw(val) {}
  CXX_BITFIELD_MEMBER(0, 0, present, raw);  // fault on a present NPT entry
  CXX_BITFIELD_MEMBER(1, 1, write, raw);    // fault caused by a write access
  CXX_BITFIELD_MEMBER(2, 2, user, raw);     // fault in guest user mode
  CXX_BITFIELD_MEMBER(4, 4, inst, raw);     // fault on instruction fetch
};
// Decoded virtual-interrupt control field of the VMCB (offset 60h).
struct Interrupt_ctl
{
  l4_uint64_t raw;
  explicit Interrupt_ctl(l4_uint64_t val) : raw(val) {}
  CXX_BITFIELD_MEMBER( 0, 7, v_tpr, raw);          // virtual TPR
  CXX_BITFIELD_MEMBER( 8, 8, v_irq, raw);          // virtual interrupt pending
  CXX_BITFIELD_MEMBER(16, 19, v_intr_prio, raw);   // priority of pending vIRQ
  CXX_BITFIELD_MEMBER(20, 20, v_ign_tpr, raw);     // deliver vIRQ regardless of TPR
  CXX_BITFIELD_MEMBER(32, 39, v_intr_vector, raw); // vector of the virtual interrupt
};
// Wrap the kernel-provided VMCB; does not take ownership of the memory.
Svm_state(void *vmcb) : _vmcb(static_cast<l4_vm_svm_vmcb_t *>(vmcb)) {}
~Svm_state() = default;
// Identify this state object as SVM (vs. VMX).
Type type() const override
{ return Type::Svm; }
// VMCB clean bits: a set bit tells the CPU the corresponding VMCB area
// is unchanged and may be used from its cached copy (AMD APM Vol. 2).
enum Clean_bits
{
  Vmcb_i    = 1 << 0,  // Intercepts: all the intercept vectors, TSC offset, Pause Filter Count
  Vmcb_iopm = 1 << 1,  // IOMSRPM: IOPM_BASE, MSRPM_BASE
  Vmcb_asid = 1 << 2,  // ASID
  Vmcb_tpr  = 1 << 3,  // V_TPR, V_IRQ, V_INTR_PRIO, V_IGN_TPR, V_INTR_MASKING, V_INTR_VECTOR (Offset 60h67h)
  Vmcb_np   = 1 << 4,  // Nested Paging: NCR3, PAT, Nested_Paging_En
  Vmcb_crx  = 1 << 5,  // CR0, CR3, CR4, EFER
  Vmcb_drx  = 1 << 6,  // DR6, DR7
  Vmcb_dt   = 1 << 7,  // GDT/IDT Limit and Base
  Vmcb_seg  = 1 << 8,  // CS/DS/SS/ES Sel/Base/Limit/Attr, CPL
  Vmcb_cr2  = 1 << 9,  // CR2
  Vmcb_lbr  = 1 << 10, // DbgCtlMsr, br_from/to, lastint_from/to
  Vmcb_avic = 1 << 11, // AVIC APIC_BAR; AVIC APIC_BACKING_PAGE, AVIC PHYSICAL_TABLE and AVIC LOGICAL_TABLE Pointers
};
// Declare the whole VMCB unchanged (all clean bits set).
void mark_all_clean()
{ _vmcb->control_area.clean_bits = ~0U; }
// Declare the whole VMCB changed (no clean bits set).
void mark_all_dirty()
{ _vmcb->control_area.clean_bits = 0U; }
// Declare the given VMCB area(s) changed by clearing their clean bits.
void mark_dirty(Clean_bits bits)
{ _vmcb->control_area.clean_bits &= ~bits; }
void init_state() override;
void setup_linux_protected_mode(l4_addr_t entry,
                                l4_addr_t stack_addr) override;
void setup_real_mode(l4_addr_t entry) override;
// Event that was being injected when the last VM exit occurred (EXITINTINFO).
Injection_event pending_event_injection() override
{
  return Injection_event(_vmcb->control_area.exitintinfo);
}
// Mark the pending injection event in EXITINTINFO as invalid by clearing
// its valid bit only.
//
// Use an unsigned 64-bit literal for the mask: with a plain `int` literal,
// `1 << 31` (the valid bit) is undefined behavior, and applying `~` to the
// 32-bit result would also zero bits 32-63 of EXITINTINFO, clobbering the
// recorded error code.
void invalidate_pending_event()
{
  _vmcb->control_area.exitintinfo &=
    ~(1ULL << Injection_event::valid_bfm_t::Lsb);
}
// True if the last nested page fault was caused by a write access.
bool pf_write() const override
{ return Npf_info(_vmcb->control_area.exitinfo1).write(); }
// Guest instruction pointer.
l4_umword_t ip() const override
{ return _vmcb->state_save_area.rip; }
// Guest stack pointer.
l4_umword_t sp() const override
{ return _vmcb->state_save_area.rsp; }
// Guest CR3 (page-table root).
l4_umword_t cr3() const override
{ return _vmcb->state_save_area.cr3; }
// Guest XCR0 (enabled XSAVE state components).
l4_uint64_t xcr0() const override
{ return _vmcb->state_save_area.xcr0; }
bool determine_next_ip_from_ip(l4_vcpu_regs_t *regs, unsigned char *inst_buf,
                               unsigned inst_buf_len);
// Skip the intercepted instruction by loading the hardware-provided
// next-RIP. A zero nRIP indicates missing decode assists; warn about it.
void jump_instruction()
{
  if (_vmcb->control_area.n_rip == 0)
    warn().printf("Next instruction pointer is zero: rip=0x%llx -> nrip=0x%llx\n",
                  _vmcb->state_save_area.rip, _vmcb->control_area.n_rip);
  _vmcb->state_save_area.rip = _vmcb->control_area.n_rip;
}
// Reason of the last VM exit.
Exit exit_code() const
{ return Exit(_vmcb->control_area.exitcode); }
// Exit-specific information words (meaning depends on exit_code()).
l4_uint64_t exit_info1() const
{ return _vmcb->control_area.exitinfo1; }
l4_uint64_t exit_info2() const
{ return _vmcb->control_area.exitinfo2; }
// Raw access to the underlying VMCB.
l4_vm_svm_vmcb_t *vmcb() const
{ return _vmcb; }
// vCPU halt state tracked by uvmm (guest executed HLT).
bool is_halted() const
{ return halted; }
void halt()
{ halted = true; }
void resume()
{ halted = false; }
// True if the guest can take an interrupt: RFLAGS.IF is set and the guest
// is not in an interrupt shadow.
bool interrupts_enabled() const
{
  // TODO: Instead we could use interrupt_shadow bit 1 here = GUEST_INTERRUPT_MASK
  return (_vmcb->state_save_area.rflags & Interrupt_enabled)
         && !(_vmcb->control_area.interrupt_shadow & 1);
}
// Clear the STI interrupt shadow (bit 0), keeping the other bits.
void clear_sti_shadow()
{ _vmcb->control_area.interrupt_shadow &= (-1ULL << 1); }
/**
 * Check if there is an event currently being injected.
 *
 * \return true iff an event is in the process of being injected
 */
bool event_injected() const
{ return Svm_event_info(_vmcb->control_area.eventinj).valid(); }
/**
 * This function checks if interrupts are enabled and no event injection is
 * in flight.
 *
 * \return true iff we can inject an interrupt into the guest
 */
bool can_inject_interrupt() const override
{ return interrupts_enabled() && !event_injected(); }
void disable_interrupt_window() override
{
  // Disable dummy virtual interrupt
  Interrupt_ctl int_ctl(_vmcb->control_area.interrupt_ctl);
  int_ctl.v_irq() = 0;
  int_ctl.v_ign_tpr() = 0;
  _vmcb->control_area.interrupt_ctl = int_ctl.raw;
  mark_dirty(Vmcb_tpr);
}
void enable_interrupt_window() override
{
  // Add dummy virtual interrupt, so that we get notified via the VINTR
  // intercept, once the guest is ready to receive interrupts.
  Interrupt_ctl int_ctl(_vmcb->control_area.interrupt_ctl);
  int_ctl.v_irq() = 1;
  int_ctl.v_intr_vector() = 0;
  int_ctl.v_ign_tpr() = 1;
  _vmcb->control_area.interrupt_ctl = int_ctl.raw;
  mark_dirty(Vmcb_tpr);
}
/**
 * Injecting NMIs is currently not supported on SVM, as in case something
 * prevents an NMI from being injected (for example interrupt shadow), we
 * would have to single-step the guest until the NMI injection is possible.
 * In addition, we would have to intercept IRET to track NMI completion.
 *
 * Starting with Zen4, AMD SVM supports VNMI for efficient injection of NMIs.
 */
bool can_inject_nmi() const override
{ /* TODO */ return false; }
void disable_nmi_window() override
{ /* TODO */ }
void enable_nmi_window() override
{ /* TODO */ }
// Decoded EVENTINJ/EXITINTINFO field (AMD APM Vol. 2, event injection).
struct Svm_event_info
{
  // Event type as encoded in bits 10:8 of EVENTINJ.
  enum class Int_type : unsigned
  {
    External_interrupt = 0,
    NMI = 2,
    Exception = 3,
    Software_interrupt = 4,
  };
  l4_uint64_t field;
  CXX_BITFIELD_MEMBER(0, 7, vector, field);        // interrupt/exception vector
  CXX_BITFIELD_MEMBER(8, 10, type, field);         // Int_type
  CXX_BITFIELD_MEMBER(11, 11, error_valid, field); // error_code field is valid
  CXX_BITFIELD_MEMBER(31, 31, valid, field);       // event is to be injected
  CXX_BITFIELD_MEMBER(32, 63, error_code, field);  // error code pushed on guest stack
  // Wrap a raw EVENTINJ/EXITINTINFO value.
  Svm_event_info(l4_uint64_t raw) : field(raw) {}
  // Compose an injection event from its components.
  Svm_event_info(unsigned i, Int_type t, unsigned err_valid = 0,
                 l4_uint32_t err_code = 0, unsigned v = 1)
  : field(0)
  {
    vector() = i;
    type() = static_cast<unsigned>(t);
    error_valid() = err_valid;
    valid() = v;
    error_code() = err_code;
  }
};
// Whether the injected exception pushes an error code onto the guest stack.
enum Deliver_error_code : unsigned
{
  No_error_code = 0,
  Push_error_code = 1,
};
// Inject a previously recorded event verbatim via EVENTINJ.
void inject_event(Injection_event const &ev) override
{
  assert(ev.valid());
  _vmcb->control_area.eventinj = ev.raw;
}
// Compose and inject an event; overwrites any event already in EVENTINJ.
void inject_event(int event_num, Svm_event_info::Int_type type,
                  Deliver_error_code deliver_err = No_error_code,
                  l4_uint32_t err_code = 0)
{
  Svm_event_info info(event_num, type, deliver_err, err_code);
  if (0) // flip to 1 for injection debugging
    warn().printf(
      "-------------- Injecting interrupt/event 0x%x -> (0x%llx)\n",
      event_num, info.field);
  _vmcb->control_area.eventinj = info.field;
}
int handle_cr0_write(l4_vcpu_regs_t *regs);
int handle_xsetbv(l4_vcpu_regs_t *regs);
bool read_msr(unsigned msr, l4_uint64_t *value) const override;
bool write_msr(unsigned msr, l4_uint64_t value, Event_recorder *) override;
int handle_hardware_exception(Event_recorder *ev_rec, unsigned num);
l4_umword_t read_gpr(l4_vcpu_regs_t *regs, unsigned reg) const;
// Human-readable name for an exit code (for diagnostics).
static const char *str_exit_code(Exit exit);
void dump(l4_vcpu_regs_t const *regs) const;
// Advance the next-RIP, e.g. after emulating part of an instruction.
void advance_entry_ip(unsigned bytes) override
{ _vmcb->control_area.n_rip += bytes; }
// No SVM-specific diagnostics on vCPU entry failure.
void additional_failure_info(unsigned /* vcpu_id */) {}
private:
static Dbg warn()
{ return Dbg(Dbg::Cpu, Dbg::Warn, "SVM"); }
static Dbg info()
{ return Dbg(Dbg::Cpu, Dbg::Info, "SVM"); }
static Dbg trace()
{ return Dbg(Dbg::Cpu, Dbg::Trace, "SVM"); }
l4_vm_svm_vmcb_t *_vmcb; // non-owning pointer to the kernel-managed VMCB
bool halted = false;     // guest executed HLT and waits for an event
};
} // namespace Vmm

View File

@@ -0,0 +1,843 @@
/*
* Copyright (C) 2017-2018, 2020-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/re/env>
#include "vm_state_vmx.h"
#include "consts.h"
#include "event_recorder.h"
namespace Vmm {
Vm_state::~Vm_state() = default;
// x86 architectural bits used by the VMX exit handling below
// (Intel SDM Vol. 3).
enum : unsigned long
{
  Misc_enable_fast_string = 1UL,       // IA32_MISC_ENABLE bit 0
  Cr0_pe_bit = 1UL,                    // CR0.PE: protected mode enable
  Cr0_pg_bit = 1UL << 31,              // CR0.PG: paging enable
  Cr4_pae_bit = 1UL << 5,              // CR4.PAE: physical address extension
  Cr4_la57_bit = 1UL << 12,            // CR4.LA57: 5-level paging
  Efer_syscall_enable_bit = 1UL,       // EFER.SCE
  Efer_lme_bit = 1UL << 8,             // EFER.LME: long mode enable
  Efer_lma_bit = 1UL << 10,            // EFER.LMA: long mode active
  Efer_nxe_bit = 1UL << 11,            // EFER.NXE: no-execute enable
  // EFER.LMA writes are ignored. Other bits reserved.
  Efer_write_mask = Efer_syscall_enable_bit | Efer_lme_bit | Efer_nxe_bit,
  Entry_ctrl_ia32e_bit = 1UL << 9,     // VM-entry control: IA-32e mode guest
};
// Construct the VMX state wrapper around the kernel-provided vCPU state and
// allocate the hardware VMCS kernel object backing this vCPU.
//
// Throws (L4Re::chksys/chkcap) if the capability cannot be allocated or the
// kernel object cannot be created, e.g. when VT-x support is missing.
Vmx_state::Vmx_state(l4_vm_vmx_vcpu_state_t *state)
: _state(state),
  _vmcs(&state->vmcs),
  _hw_vmcs(L4Re::chkcap(L4Re::Util::make_unique_cap<L4::Vcpu_context>(),
                        "Failed to allocate hardware VMCS capability."))
{
  // Create the hardware VMCS
  auto *env = L4Re::Env::env();
  auto ret = env->factory()->create(_hw_vmcs.get(), L4_PROTO_VCPU_CONTEXT);
  if (l4_error(ret) < 0)
    L4Re::chksys(ret, "Cannot create guest VM hardware VMCS. Virtualization "
                      "support may be missing.");
  // Nested virtualization is reported by the kernel but not implemented here.
  if (nested_abi_revision() != 0)
    info().printf("vCPU interface supports nested virtualization. However, "
                  "uvmm does not implement nested virtualization.\n");
}
/**
 * Handle exits due to HW/SW exceptions, NMIs, and external interrupts.
 *
 * Bit 11, error_valid, is not set if an external interrupt occurred and
 * 'acknowledge interrupt on exit' is not set in the exit controls.
 *
 * \param ev_rec  Event recorder used to queue events for injection into the
 *                guest.
 *
 * \retval Retry       Event recorded for guest injection; resume the guest.
 * \retval -L4_ENOSYS  Interruption type not handled here.
 */
int
Vmx_state::handle_exception_nmi_ext_int(Event_recorder *ev_rec)
{
  Vm_exit_int_info interrupt_info = exit_int_info();
  l4_uint32_t interrupt_error = 0;
  if (interrupt_info.error_valid())
    interrupt_error =
      static_cast<l4_uint32_t>(vmx_read(VMCS_VM_EXIT_INTERRUPT_ERROR));

  trace().printf("Exception, NMI or external interrupt exit: 0x%x/0x%x\n",
                 interrupt_info.field, static_cast<unsigned>(interrupt_error));

  switch (interrupt_info.type())
    {
    // Pin-based controls not set, Ext_int and NMI should not happen.
    case 0x0: warn().printf("External interrupt\n"); break;
    case 0x2: warn().printf("NMI\n"); break;
    case 0x3: // hardware exception
      return handle_hardware_exception(ev_rec, interrupt_info.vector(),
                                       interrupt_error);
    case 0x4: // software interrupt: INT n
      // Software interrupt event record
      using Event_sw_int = Event_sw_generic<4>;
      ev_rec->make_add_event<Event_sw_int>(Event_prio::Sw_intN,
                                           interrupt_info.vector(),
                                           2U); // opcode + operand
      return Retry;
    case 0x5: // privileged software exception: INT1
      // Privileged software exception event record
      using Event_priv_sw_exc = Event_sw_generic<5>;
      ev_rec->make_add_event<Event_priv_sw_exc>(Event_prio::Sw_int1, 1, 1U);
      return Retry;
    case 0x6: // software exception: INT3, INTO
      {
        // Software exception event record
        using Event_sw_exc = Event_sw_generic<6>;
        unsigned vec = interrupt_info.vector();
        if (vec == 3)
          {
            ev_rec->make_add_event<Event_sw_exc>(Event_prio::Sw_int3, vec, 1U);
            return Retry;
          }
        else if (vec == 4)
          {
            ev_rec->make_add_event<Event_sw_exc>(Event_prio::Sw_intO, vec, 1U);
            return Retry;
          }
        else
          // not defined in Intel SDM; leave this here as debug hint.
          warn().printf("Unknown software exception %u\n", vec);
        break;
      }
    default:
      warn().printf("Unknown interrupt type: %u, vector: %u\n",
                    interrupt_info.type().get(), interrupt_info.vector().get());
      break;
    }

  return -L4_ENOSYS;
}
/**
 * Read an MSR on behalf of the guest.
 *
 * MSRs with a VMCS shadow field are read from the VMCS; a small set of
 * additional MSRs is emulated with fixed values.
 *
 * \param      msr    MSR number to read.
 * \param[out] value  Read value; only valid if the function returns true.
 *
 * \return true if the MSR was handled here, false for unknown MSRs.
 */
bool
Vmx_state::read_msr(unsigned msr, l4_uint64_t *value) const
{
  unsigned shadow = msr_shadow_reg(msr);
  if (shadow > 0)
    {
      *value = vmx_read(shadow);
    }
  else
    {
      switch (msr)
        {
        case 0x17: // IA32_PLATFORM_ID
          *value = 0U;
          break;
        case 0x1a0: // IA32_MISC_ENABLE
          *value = Misc_enable_fast_string;
          break;
        case 0x3a: // IA32_FEATURE_CONTROL
          // Lock register so the guest does not try to enable anything.
          *value = 1U;
          break;
        case 0x277: // IA32_PAT
          *value = vmx_read(VMCS_GUEST_IA32_PAT);
          break;
        case 0xc0000080: // efer
          *value = vmx_read(VMCS_GUEST_IA32_EFER);
          break;
        /*
         * Non-architectural MSRs known to be probed by Linux that can be
         * safely ignored:
         * 0xce  // MSR_PLATFORM_INFO
         * 0x33  // TEST_CTRL
         * 0x34  // MSR_SMI_COUNT
         * 0x140 // MISC_FEATURE_ENABLES
         * 0x64e // MSR_PPERF
         * 0x639 // MSR_PP0_ENERGY_STATUS
         * 0x611 // MSR_PKG_ENERGY_STATUS
         * 0x619 // MSR_DRAM_ENERGY_STATUS
         * 0x641 // MSR_PP1_ENERGY_STATUS
         * 0x64d // MSR_PLATFORM_ENERGY_COUNTER
         * 0x606 // MSR_RAPL_POWER_UNIT
         */
        default:
          return false;
        }
    }
  return true;
}
bool
Vmx_state::write_msr(unsigned msr, l4_uint64_t value, Event_recorder *ev_rec)
{
unsigned shadow = msr_shadow_reg(msr);
if (shadow > 0)
{
vmx_write(shadow, value);
return true;
}
switch (msr)
{
case 0x277: // IA32_PAT
// sanitization of 7 PAT values
// 0xF8 are reserved bits
// 0x2 and 0x3 are reserved encodings
// usage of reserved bits and encodings results in a #GP
if (value & 0xF8F8F8F8F8F8F8F8ULL)
{
ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 13, 0);
break;
}
for (unsigned i = 0; i < 7; ++i)
{
l4_uint64_t const PAi_mask = (value & (0x7ULL << i * 8)) >> i * 8;
if ((PAi_mask == 0x2ULL) || (PAi_mask == 0x3ULL))
{
ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 13, 0);
break;
}
}
vmx_write(VMCS_GUEST_IA32_PAT, value);
break;
case 0xc0000080: // efer
{
l4_uint64_t old_efer = vmx_read(VMCS_GUEST_IA32_EFER);
// LMA writes are ignored.
l4_uint64_t efer = (value & Efer_write_mask) | (old_efer & Efer_lma_bit);
l4_uint64_t cr0 = vmx_read(VMCS_GUEST_CR0);
trace().printf("IA32_EFER write: CR0: 0x%llx, old efer 0x%llx, "
"new efer 0x%llx\n",
cr0, old_efer, efer);
if (cr0 & Cr0_pg_bit)
{
// Can't change LME while CR0.PG is set. SDM vol 3. 4.1
if ((efer & Efer_lme_bit) != (old_efer & Efer_lme_bit))
{
// Inject GPF and do not write IA32_EFER
ev_rec->make_add_event<Event_exc>(Event_prio::Exception, 13, 0);
break;
}
}
vmx_write(VMCS_GUEST_IA32_EFER, efer);
break;
}
case 0x8b: // IA32_BIOS_SIGN_ID
case 0x140: // unknown in Intel 6th gen, but MISC_FEATURE register for xeon
break;
case 0x1a0:
warn().printf("Writing MSR 0x%x IA32_MISC_ENABLED 0x%llx\n", msr, value);
break;
case 0xe01: // MSR_UNC_PERF_GLOBAL_CTRL
// can all be savely ignored
break;
default:
return false;
}
return true;
}
/**
 * Handle a control-register access exit (MOV to CR / CLTS).
 *
 * Decodes the exit qualification, extracts the new value from the guest
 * GPRs and applies it to CR0 or CR4, maintaining the IA-32e entry control
 * and EFER.LMA when paging is switched on or off.
 *
 * \param regs  Guest general-purpose registers at exit time.
 *
 * \retval Jump_instr          Access emulated; skip the instruction.
 * \retval General_protection  Invalid transition; inject #GP, no write.
 * \retval -L4_EINVAL          Unsupported access type or register.
 *
 * NOTE(review): "MOV from CR" (qualification access type 1) and LMSW
 * (type 3) are not handled and fall into the -L4_EINVAL path — presumably
 * those accesses are never intercepted with the configured CR masks;
 * confirm against the VMCS setup.
 */
int
Vmx_state::handle_cr_access(l4_vcpu_regs_t *regs)
{
  auto qual = vmx_read(VMCS_EXIT_QUALIFICATION);
  int crnum;
  l4_umword_t newval;
  // Bits 5:4 of the qualification encode the access type.
  switch ((qual >> 4) & 3)
    {
    case 0: // mov to cr
      crnum = qual & 0xF;
      // Bits 11:8 select the source GPR. RSP lives in the VMCS, not in regs.
      switch ((qual >> 8) & 0xF)
        {
        case 0: newval = regs->ax; break;
        case 1: newval = regs->cx; break;
        case 2: newval = regs->dx; break;
        case 3: newval = regs->bx; break;
        case 4: newval = vmx_read(VMCS_GUEST_RSP); break;
        case 5: newval = regs->bp; break;
        case 6: newval = regs->si; break;
        case 7: newval = regs->di; break;
        case 8: newval = regs->r8; break;
        case 9: newval = regs->r9; break;
        case 10: newval = regs->r10; break;
        case 11: newval = regs->r11; break;
        case 12: newval = regs->r12; break;
        case 13: newval = regs->r13; break;
        case 14: newval = regs->r14; break;
        case 15: newval = regs->r15; break;
        default:
          warn().printf("Loading CR from unknown register\n");
          return -L4_EINVAL;
        }
      break;
    case 2: // clts: clear CR0.TS (bit 3)
      crnum = 0;
      newval = vmx_read(VMCS_GUEST_CR0) & ~(1ULL << 3);
      break;
    default:
      warn().printf("Unknown CR action %lld.\n", (qual >> 4) & 3);
      return -L4_EINVAL;
    }
  switch (crnum)
    {
    case 0:
      {
        auto old_cr0 = vmx_read(VMCS_GUEST_CR0);
        trace().printf("Write to cr0: 0x%llx -> 0x%lx\n", old_cr0, newval);
        l4_uint64_t cr4 = vmx_read(VMCS_GUEST_CR4);
        l4_uint64_t efer = vmx_read(VMCS_GUEST_IA32_EFER);
        // enable paging
        if ((newval & Cr0_pg_bit) && !(old_cr0 & Cr0_pg_bit))
          {
            // PAE is required for long mode; LA57 requires long mode.
            if (   (!(cr4 & Cr4_pae_bit) && (efer & Efer_lme_bit))
                || (!(efer & Efer_lme_bit) && (cr4 & Cr4_la57_bit)))
              {
                // inject GPF and do not write CR0
                return General_protection;
              }
            // LA57:  Cr4.PAE,  EFER.LME,  Cr4.LA57
            // IA32e: Cr4.PAE,  EFER.LME, !Cr4.LA57
            // PAE:   Cr4.PAE, !EFER.LME, !Cr4.LA57
            // 32bit: !Cr4.PAE, !EFER.LME, !Cr4.LA57
            if ((cr4 & Cr4_pae_bit) && (efer & Efer_lme_bit))
              {
                if (cr4 & Cr4_la57_bit)
                  info().printf("Enable LA57 paging\n");
                else
                  info().printf("Enable IA32e paging\n");
                vmx_write(VMCS_VM_ENTRY_CTLS,
                          vmx_read(VMCS_VM_ENTRY_CTLS) | Entry_ctrl_ia32e_bit);
                // Contrary to SDM Vol 3, 24.8.1. IA32_EFER.LMA is not set to
                // the value of ENTRY_CTLS.IA32e on VMentry.
                vmx_write(VMCS_GUEST_IA32_EFER, efer | Efer_lma_bit);
              }
            else if (cr4 & Cr4_pae_bit) // && !EFER.LME
              trace().printf("Enable PAE paging.\n");
            else
              trace().printf("Enable 32-bit paging\n");
          }
        // disable paging
        if (!(newval & Cr0_pg_bit) && (old_cr0 & Cr0_pg_bit))
          {
            trace().printf("Disabling paging ...\n");
            vmx_write(VMCS_VM_ENTRY_CTLS,
                      vmx_read(VMCS_VM_ENTRY_CTLS) & ~Entry_ctrl_ia32e_bit);
            // Contrary to SDM Vol 3, 24.8.1. IA32_EFER.LMA is not set to
            // the value of ENTRY_CTLS.IA32e on VMentry.
            vmx_write(VMCS_GUEST_IA32_EFER, efer & ~Efer_lma_bit);
          }
        // 0x10 => Extension Type; hardcoded to 1 see manual
        vmx_write(VMCS_GUEST_CR0, newval | 0x10);
        vmx_write(VMCS_CR0_READ_SHADOW, newval);
        break;
      }
    case 4:
      {
        trace().printf("mov to cr4: 0x%lx, RIP 0x%lx\n", newval, ip());
        l4_uint64_t old_cr4 = vmx_read(VMCS_GUEST_CR4);
        if (vmx_read(VMCS_GUEST_CR0) & Cr0_pg_bit)
          {
            // LA57 must not change while paging is enabled.
            if ((newval & Cr4_la57_bit) != (old_cr4 & Cr4_la57_bit))
              {
                // inject GPF and do not write CR4
                return General_protection;
              }
            l4_uint64_t efer = vmx_read(VMCS_GUEST_IA32_EFER);
            // PAE must stay set while long mode is enabled.
            if (!(newval & Cr4_pae_bit) && (efer & Efer_lme_bit))
              {
                // inject GPF and do not write CR4
                return General_protection;
              }
            // !EFER.LME means either PAE or 32-bit paging. Transitioning
            // between these two while Cr0.PG is set is allowed.
          }
        // We don't support 5-level page tables, be quirky and don't allow
        // setting this bit. (Or fix page-table walker.)
        if (newval & Cr4_la57_bit)
          {
            info().printf("Cr4 Guest wants to enable LA57. Filtering...\n");
            newval &= ~Cr4_la57_bit;
          }
        // CR4 0x2000 = VMXEnable bit
        // force VMXEnable bit, but hide it from guest
        vmx_write(VMCS_GUEST_CR4, newval | 0x2000);
        vmx_write(VMCS_CR4_READ_SHADOW, newval);
        break;
      }
    default:
      warn().printf("Unknown CR access.\n");
      return -L4_EINVAL;
    }
  return Jump_instr;
}
/**
 * Reflect a hardware exception back into the guest.
 *
 * In real mode exceptions never carry an error code; otherwise the
 * architecturally defined set of exceptions gets its error code attached.
 *
 * \param ev_rec    Event recorder the exception is queued on.
 * \param num       Exception vector number.
 * \param err_code  Error code delivered with the exit (if any).
 *
 * \return Retry, the exception is recorded for injection.
 */
int
Vmx_state::handle_hardware_exception(Event_recorder *ev_rec, unsigned num,
                                     l4_uint32_t err_code)
{
  if (in_real_mode())
    {
      // In real mode, exceptions do not push an error code.
      ev_rec->make_add_event<Real_mode_exc>(Event_prio::Exception, num);
      return Retry;
    }

  // Exceptions that push an error code onto the guest stack:
  // #DF, #TS, #NP, #SS, #GP, #PF, #AC, #CP. Everything else — including
  // #DB, which may be fault or trap type — is reflected without one and
  // without changing guest state (in particular the IP stays untouched).
  bool pushes_error;
  switch (num)
    {
    case 8: case 10: case 11: case 12: case 13: case 14: case 17: case 21:
      pushes_error = true;
      break;
    default:
      pushes_error = false;
      break;
    }

  if (pushes_error)
    ev_rec->make_add_event<Event_exc>(Event_prio::Exception, num, err_code);
  else
    ev_rec->make_add_event<Event_exc>(Event_prio::Exception, num);

  return Retry;
}
/**
 * Store an IO port value (INS) to guest memory, dispatching on the
 * operand width to the typed implementation.
 *
 * \return Result of the typed store, or Invalid_opcode for unsupported
 *         widths (64-bit port I/O does not exist).
 */
int
Vmx_state::store_io_value(l4_vcpu_regs_t *regs, cxx::Ref_ptr<Pt_walker> ptw,
                          Vmx_insn_info_field info, Mem_access::Width op_width,
                          l4_uint32_t value)
{
  if (op_width == Mem_access::Wd8)
    return store_io_value_t<l4_uint8_t>(regs, ptw, info, value);

  if (op_width == Mem_access::Wd16)
    return store_io_value_t<l4_uint16_t>(regs, ptw, info, value);

  if (op_width == Mem_access::Wd32)
    return store_io_value_t<l4_uint32_t>(regs, ptw, info, value);

  return Invalid_opcode;
}
/**
 * Load an IO port value (OUTS) from guest memory, dispatching on the
 * operand width to the typed implementation.
 *
 * \return Result of the typed load, or Invalid_opcode for unsupported
 *         widths (64-bit port I/O does not exist).
 */
int
Vmx_state::load_io_value(l4_vcpu_regs_t *regs, cxx::Ref_ptr<Pt_walker> ptw,
                         Vmx_insn_info_field info, Mem_access::Width op_width,
                         l4_uint32_t *value)
{
  if (op_width == Mem_access::Wd8)
    return load_io_value_t<l4_uint8_t>(regs, ptw, info, value);

  if (op_width == Mem_access::Wd16)
    return load_io_value_t<l4_uint16_t>(regs, ptw, info, value);

  if (op_width == Mem_access::Wd32)
    return load_io_value_t<l4_uint32_t>(regs, ptw, info, value);

  return Invalid_opcode;
}
/**
 * Evaluate and update the REP-prefix repetition condition.
 *
 * Checks whether (R/E)CX (masked to the instruction's address size) is
 * non-zero and, if so, decrements it within the masked width only.
 *
 * \param      regs  Guest general-purpose registers (CX is updated).
 * \param      info  Instruction information (provides the address size).
 * \param[out] next  Set to true if another iteration should be executed.
 *
 * \return Jump_instr on success, Invalid_opcode for a bad address size.
 */
int
Vmx_state::rep_prefix_condition(l4_vcpu_regs_t *regs, Vmx_insn_info_field info,
                                bool *next)
{
  // Prepare a mask for the register size.
  l4_uint64_t mask;
  int ret = address_size_mask(info.address_size(), &mask);
  if (ret != Jump_instr)
    return ret;
  // Check the condition.
  *next = (regs->cx & mask) != 0;
  // Decrement the register.
  if (*next)
    regs->cx = (regs->cx & ~mask) | ((regs->cx - 1) & mask);
  return Jump_instr;
}
// True if the guest has paging enabled (CR0.PG set).
bool
Vmx_state::is_paging_enabled() const
{
  return vmx_read(VMCS_GUEST_CR0) & Cr0_pg_bit;
}
// True if the guest runs in real mode (CR0.PE clear).
bool
Vmx_state::in_real_mode() const
{
  return (vmx_read(VMCS_GUEST_CR0) & Cr0_pe_bit) == 0;
}
// True if the guest runs in long mode (EFER.LMA set).
bool
Vmx_state::in_long_mode() const
{
  return vmx_read(VMCS_GUEST_IA32_EFER) & Efer_lma_bit;
}
/**
 * Typed implementation of the INS memory store.
 *
 * Resolves the destination address of the string I/O instruction, writes
 * the port value there and advances the offset register (DI family).
 */
template <typename TYPE>
int
Vmx_state::store_io_value_t(l4_vcpu_regs_t *regs, cxx::Ref_ptr<Pt_walker> ptw,
                            Vmx_insn_info_field info, l4_uint32_t value)
{
  // Get the INS instruction argument and the offset register.
  unsigned offset_reg;
  TYPE *ptr;
  int ret = get_io_argument(regs, ptw, info, true, &offset_reg, &ptr);
  if (ret != Jump_instr)
    return ret;
  // Store the IO value to memory.
  *ptr = value;
  // Update the instruction argument (i.e. advance the offset register).
  return advance_gpr(regs, offset_reg, info.address_size(), sizeof(TYPE));
}
/**
 * Typed implementation of the OUTS memory load.
 *
 * Resolves the source address of the string I/O instruction, reads the
 * value to be sent to the port and advances the offset register (SI family).
 */
template <typename TYPE>
int
Vmx_state::load_io_value_t(l4_vcpu_regs_t *regs, cxx::Ref_ptr<Pt_walker> ptw,
                           Vmx_insn_info_field info, l4_uint32_t *value)
{
  // Get the INS instruction argument and the offset register.
  unsigned offset_reg;
  TYPE *ptr;
  int ret = get_io_argument(regs, ptw, info, false, &offset_reg, &ptr);
  if (ret != Jump_instr)
    return ret;
  // Load the IO value from memory.
  *value = *ptr;
  // Update the instruction argument (i.e. advance the offset register).
  return advance_gpr(regs, offset_reg, info.address_size(), sizeof(TYPE));
}
/**
 * Resolve the memory argument of a string I/O instruction (INS/OUTS).
 *
 * Determines segment and offset register per the instruction semantics,
 * truncates the offset to the effective address size, applies segmentation
 * and walks the guest page tables to obtain a host-virtual pointer.
 *
 * \param      regs        Guest general-purpose registers.
 * \param      ptw         Guest page-table walker.
 * \param      info        Instruction information field.
 * \param      store       true for INS (store to memory), false for OUTS.
 * \param[out] offset_reg  GPR index of the offset register ((E/R)DI or (E/R)SI).
 * \param[out] ptr         Host-virtual pointer to the guest memory operand.
 *
 * \return Jump_instr on success, or an injection/emulation error code.
 */
template <typename TYPE>
int
Vmx_state::get_io_argument(l4_vcpu_regs_t *regs, cxx::Ref_ptr<Pt_walker> ptw,
                           Vmx_insn_info_field info, bool store,
                           unsigned *offset_reg, TYPE **ptr)
{
  // Non-paged modes are not supported.
  if (!is_paging_enabled())
    return Invalid_opcode;
  unsigned segment_reg;
  if (store)
    {
      // For INS, the address is always determined by ES:DI, ES:EDI or
      // RDI.
      segment_reg = 0;
      *offset_reg = 7;
    }
  else
    {
      // For OUTS, the address offset is determined by SI, ESI or RSI. The
      // segment register can be overriden by a segment prefix.
      segment_reg = info.segment();
      *offset_reg = 6;
    }
  l4_uint64_t offset;
  int ret = read_gpr(regs, *offset_reg, &offset);
  if (ret != Jump_instr)
    return ret;
  // Truncate the effective address according to the operand size.
  switch (info.address_size())
    {
    case 0: // 16-bit address size
      offset &= 0xffffU;
      break;
    case 1: // 32-bit address size
      offset &= 0xffffffffU;
      break;
    case 2: // 64-bit address size
      /* No op */
      break;
    default:
      return Invalid_opcode;
    }
  // Compute the linear (guest-virtual) address (taking segmentation into
  // account).
  l4_uint64_t addr;
  ret = compute_linear_addr<TYPE>(segment_reg, offset, true, &addr);
  if (ret != Jump_instr)
    return ret;
  // Walk the page tables to convert the guest-virtual address to the
  // host-virtual address.
  *ptr = reinterpret_cast<TYPE *>(ptw->walk(cr3(), addr));
  return Jump_instr;
}
/**
 * Read a guest GPR by its VMX instruction-information register number
 * (0 = RAX ... 15 = R15).
 *
 * RSP (4) lives in the VMCS; all other registers are read from regs.
 * NOTE(review): `*(&(regs->ax) - reg)` assumes l4_vcpu_regs_t lays out the
 * GPRs contiguously in descending register-number order ending at ax —
 * confirm against the l4_vcpu_regs_t definition.
 *
 * \return Jump_instr on success, Invalid_opcode for reg > 15.
 */
int
Vmx_state::read_gpr(l4_vcpu_regs_t *regs, unsigned reg, l4_uint64_t *value)
{
  if (reg > 15)
    return Invalid_opcode;
  if (reg == 4)
    *value = vmx_read(VMCS_GUEST_RSP);
  else
    *value = *(&(regs->ax) - reg);
  return Jump_instr;
}
/**
 * Write a guest GPR by its VMX instruction-information register number
 * (0 = RAX ... 15 = R15); counterpart of read_gpr().
 *
 * RSP (4) is written to the VMCS; all other registers to regs.
 *
 * \return Jump_instr on success, Invalid_opcode for reg > 15.
 */
int
Vmx_state::write_gpr(l4_vcpu_regs_t *regs, unsigned reg, l4_uint64_t value)
{
  if (reg > 15)
    return Invalid_opcode;
  if (reg == 4)
    vmx_write(VMCS_GUEST_RSP, value);
  else
    *(&(regs->ax) - reg) = value;
  return Jump_instr;
}
/**
 * Advance a GPR by a string-instruction step, honoring RFLAGS.DF and the
 * effective address size (only the masked low bits are modified).
 *
 * \param regs          Guest general-purpose registers.
 * \param reg           Register number to advance.
 * \param address_size  Encoded address size (0 = 16, 1 = 32, 2 = 64 bit).
 * \param advancement   Step width in bytes (operand size).
 *
 * \return Jump_instr on success, or an error from the helpers.
 */
int
Vmx_state::advance_gpr(l4_vcpu_regs_t *regs, unsigned reg,
                       unsigned address_size, size_t advancement)
{
  // Determine the direction of the advancement.
  bool decrement = vmx_read(VMCS_GUEST_RFLAGS) & Direction_bit;
  // Prepare a mask for the register size.
  l4_uint64_t mask;
  int ret = address_size_mask(address_size, &mask);
  if (ret != Jump_instr)
    return ret;
  // Advance the register value.
  l4_uint64_t value;
  ret = read_gpr(regs, reg, &value);
  if (ret != Jump_instr)
    return ret;
  if (decrement)
    value = (value & ~mask) | ((value - advancement) & mask);
  else
    value = (value & ~mask) | ((value + advancement) & mask);
  return write_gpr(regs, reg, value);
}
/**
 * Compute the linear (guest-virtual) address of a memory operand,
 * applying segmentation and access checks.
 *
 * In long mode only FS/GS bases apply and the result must be canonical.
 * In protected/compatibility mode the full segment base/limit/access-rights
 * checks are performed for an operand of sizeof(TYPE) bytes.
 *
 * \param      segment  Segment register number (0=ES 1=CS 2=SS 3=DS 4=FS 5=GS).
 * \param      offset   Effective address within the segment.
 * \param      store    true for a write access, false for a read.
 * \param[out] linear   Resulting linear address.
 *
 * \retval Jump_instr          Address computed and checks passed.
 * \retval General_protection  Segment violation (#GP to be injected).
 * \retval Stack_fault         SS violation (#SS to be injected).
 * \retval Invalid_opcode      Real mode or invalid segment number.
 */
template <typename TYPE>
int
Vmx_state::compute_linear_addr(unsigned segment, l4_uint64_t offset,
                               bool store, l4_uint64_t *linear)
{
  if (in_real_mode())
    return Invalid_opcode;
  bool valid;
  if (in_long_mode())
    {
      // In long mode, segmentation is essentially non-existent except for the
      // potentially non-zero base of the FS and GS segments.
      l4_uint64_t sgbase;
      switch (segment)
        {
        case 0:
        case 1:
        case 2:
        case 3:
          sgbase = 0;
          break;
        case 4:
          sgbase = vmx_read(VMCS_GUEST_FS_BASE);
          break;
        case 5:
          sgbase = vmx_read(VMCS_GUEST_GS_BASE);
          break;
        default:
          return Invalid_opcode;
        }
      *linear = sgbase + offset;
      // Guard against non-canonical addresses.
      valid = is_cannonical_addr(*linear);
    }
  else
    {
      // In compatibility mode and protected mode, standard segmentation
      // rules apply.
      l4_uint64_t sgbase;
      l4_uint32_t access;
      l4_uint32_t slimit;
      switch (segment)
        {
        case 0:
          sgbase = vmx_read(VMCS_GUEST_ES_BASE);
          access = vmx_read(VMCS_GUEST_ES_ACCESS_RIGHTS);
          slimit = vmx_read(VMCS_GUEST_ES_LIMIT);
          break;
        case 1:
          sgbase = vmx_read(VMCS_GUEST_CS_BASE);
          access = vmx_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
          slimit = vmx_read(VMCS_GUEST_CS_LIMIT);
          break;
        case 2:
          sgbase = vmx_read(VMCS_GUEST_SS_BASE);
          access = vmx_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
          slimit = vmx_read(VMCS_GUEST_SS_LIMIT);
          break;
        case 3:
          sgbase = vmx_read(VMCS_GUEST_DS_BASE);
          access = vmx_read(VMCS_GUEST_DS_ACCESS_RIGHTS);
          slimit = vmx_read(VMCS_GUEST_DS_LIMIT);
          break;
        case 4:
          sgbase = vmx_read(VMCS_GUEST_FS_BASE);
          access = vmx_read(VMCS_GUEST_FS_ACCESS_RIGHTS);
          slimit = vmx_read(VMCS_GUEST_FS_LIMIT);
          break;
        case 5:
          sgbase = vmx_read(VMCS_GUEST_GS_BASE);
          access = vmx_read(VMCS_GUEST_GS_ACCESS_RIGHTS);
          slimit = vmx_read(VMCS_GUEST_GS_LIMIT);
          break;
        default:
          return Invalid_opcode;
        }
      // Linear addresses are truncated to 32 bits.
      *linear = (sgbase + offset) & 0xffffffffU;
      if (store)
        {
          // Guard against read-only data segments and code segments.
          if (((access & 0x0a) == 0) || (access & 0x08))
            return General_protection;
        }
      else
        {
          // Guard against execute-only code segments.
          if ((access & 0x0a) == 0x08)
            return General_protection;
        }
      // Guard against unusable segments.
      valid = ((access & 0x10000U) == 0);
      // Unless the segment is flat (i.e. having base 0 with maximal limit,
      // being any code segment or a non-expand-down data segment), guard
      // against memory operands outside the segment limit.
      if ((valid) && ((sgbase != 0) || (slimit != 0xffffffffU)
                      || (!(access & 0x08) && (access & 0x04))))
        valid = (*linear + sizeof(TYPE) - 1 <= slimit);
    }
  if (!valid)
    {
      if (segment == 2) // SS
        return Stack_fault;
      else
        return General_protection;
    }
  return Jump_instr;
}
// Sign-extend an address to its canonical form: from bit 56 with LA57
// (57-bit virtual addresses, shift by 64 - 57 = 7), from bit 47 otherwise
// (48-bit virtual addresses, shift by 64 - 48 = 16).
// (The name keeps the historical "cannonical" spelling for API stability.)
l4_uint64_t
Vmx_state::extend_cannonical_addr(l4_uint64_t addr) const
{
  unsigned int bits = (vmx_read(VMCS_GUEST_CR4) & Cr4_la57_bit) ? 7 : 16;
  return extend_sign64(addr, bits);
}
// An address is canonical iff sign extension leaves it unchanged.
bool
Vmx_state::is_cannonical_addr(l4_uint64_t addr) const
{
  return extend_cannonical_addr(addr) == addr;
}
int
Vmx_state::address_size_mask(unsigned address_size, l4_uint64_t *mask)
{
switch (address_size)
{
case 0:
*mask = 0xffffU;
break;
case 1:
*mask = 0xffffffffU;
break;
case 2:
*mask = ~0UL;
break;
default:
return Invalid_opcode;
}
return Jump_instr;
}
// Sign-extend the value from bit (63 - extension): shift the sign bit into
// bit 63, then shift back arithmetically. Relies on the (universally
// implemented) arithmetic right shift of negative signed integers.
l4_uint64_t
Vmx_state::extend_sign64(l4_uint64_t value, unsigned extension)
{
  return static_cast<l4_int64_t>(value << extension) >> extension;
}
} //namespace Vmm

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,267 @@
/*
* Copyright (C) 2017, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
/*
* The constants are defined in the Intel Software Developer Manual Volume 3,
* Appendix B.
*/
/**
 * 16-bit width VMCS fields.
 *
 * Field encodings as listed in the Intel SDM Vol. 3, Appendix B.
 */
enum Vmx_vmcs_16bit_fields
{
  /* Control fields */
  VMCS_VPID                        = 0x0000,
  VMCS_PIR_NOTIFICATION_VECTOR     = 0x0002,
  VMCS_EPTP_INDEX                  = 0x0004,
  VMCS_HLAT_PREFIX_SIZE            = 0x0006,
  VMCS_LAST_PID_PTR_INDEX          = 0x0008,
  /* Guest-state fields */
  VMCS_GUEST_ES_SELECTOR           = 0x0800,
  VMCS_GUEST_CS_SELECTOR           = 0x0802,
  VMCS_GUEST_SS_SELECTOR           = 0x0804,
  VMCS_GUEST_DS_SELECTOR           = 0x0806,
  VMCS_GUEST_FS_SELECTOR           = 0x0808,
  VMCS_GUEST_GS_SELECTOR           = 0x080a,
  VMCS_GUEST_LDTR_SELECTOR         = 0x080c,
  VMCS_GUEST_TR_SELECTOR           = 0x080e,
  VMCS_GUEST_INTERRUPT_STATUS      = 0x0810,
  VMCS_GUEST_PML_INDEX             = 0x0812,
  VMCS_GUEST_UINV                  = 0x0814,
  /* Host-state fields */
  VMCS_HOST_ES_SELECTOR            = 0x0c00,
  VMCS_HOST_CS_SELECTOR            = 0x0c02,
  VMCS_HOST_SS_SELECTOR            = 0x0c04,
  VMCS_HOST_DS_SELECTOR            = 0x0c06,
  VMCS_HOST_FS_SELECTOR            = 0x0c08,
  VMCS_HOST_GS_SELECTOR            = 0x0c0a,
  VMCS_HOST_TR_SELECTOR            = 0x0c0c,
};
/**
 * 32-bit width VMCS fields.
 *
 * Field encodings as listed in the Intel SDM Vol. 3, Appendix B.
 */
enum Vmx_vmcs_32bit_fields
{
  /* Control fields */
  VMCS_PIN_BASED_VM_EXEC_CTLS       = 0x4000,
  VMCS_PRI_PROC_BASED_VM_EXEC_CTLS  = 0x4002,
  VMCS_EXCEPTION_BITMAP             = 0x4004,
  VMCS_PAGE_FAULT_ERROR_MASK        = 0x4006,
  VMCS_PAGE_FAULT_ERROR_MATCH       = 0x4008,
  VMCS_CR3_TARGET_COUNT             = 0x400a,
  VMCS_VM_EXIT_CTLS                 = 0x400c,
  VMCS_VM_EXIT_MSR_STORE_COUNT      = 0x400e,
  VMCS_VM_EXIT_MSR_LOAD_COUNT       = 0x4010,
  VMCS_VM_ENTRY_CTLS                = 0x4012,
  VMCS_VM_ENTRY_MSR_LOAD_COUNT      = 0x4014,
  VMCS_VM_ENTRY_INTERRUPT_INFO      = 0x4016,
  VMCS_VM_ENTRY_EXCEPTION_ERROR     = 0x4018,
  VMCS_VM_ENTRY_INSN_LEN            = 0x401a,
  VMCS_TPR_THRESHOLD                = 0x401c,
  VMCS_SEC_PROC_BASED_VM_EXEC_CTLS  = 0x401e,
  VMCS_PLE_GAP                      = 0x4020,
  VMCS_PLE_WINDOW                   = 0x4022,
  VMCS_INSTRUCTION_TIMEOUT_CTRL     = 0x4024,
  /* Read-only data fields */
  VMCS_VM_INSN_ERROR                = 0x4400,
  VMCS_EXIT_REASON                  = 0x4402,
  VMCS_VM_EXIT_INTERRUPT_INFO       = 0x4404,
  VMCS_VM_EXIT_INTERRUPT_ERROR      = 0x4406,
  VMCS_IDT_VECTORING_INFO           = 0x4408,
  VMCS_IDT_VECTORING_ERROR          = 0x440a,
  VMCS_VM_EXIT_INSN_LENGTH          = 0x440c,
  VMCS_VM_EXIT_INSN_INFO            = 0x440e,
  /* Guest-state fields */
  VMCS_GUEST_ES_LIMIT               = 0x4800,
  VMCS_GUEST_CS_LIMIT               = 0x4802,
  VMCS_GUEST_SS_LIMIT               = 0x4804,
  VMCS_GUEST_DS_LIMIT               = 0x4806,
  VMCS_GUEST_FS_LIMIT               = 0x4808,
  VMCS_GUEST_GS_LIMIT               = 0x480a,
  VMCS_GUEST_LDTR_LIMIT             = 0x480c,
  VMCS_GUEST_TR_LIMIT               = 0x480e,
  VMCS_GUEST_GDTR_LIMIT             = 0x4810,
  VMCS_GUEST_IDTR_LIMIT             = 0x4812,
  VMCS_GUEST_ES_ACCESS_RIGHTS       = 0x4814,
  VMCS_GUEST_CS_ACCESS_RIGHTS       = 0x4816,
  VMCS_GUEST_SS_ACCESS_RIGHTS       = 0x4818,
  VMCS_GUEST_DS_ACCESS_RIGHTS       = 0x481a,
  VMCS_GUEST_FS_ACCESS_RIGHTS       = 0x481c,
  VMCS_GUEST_GS_ACCESS_RIGHTS       = 0x481e,
  VMCS_GUEST_LDTR_ACCESS_RIGHTS     = 0x4820,
  VMCS_GUEST_TR_ACCESS_RIGHTS       = 0x4822,
  VMCS_GUEST_INTERRUPTIBILITY_STATE = 0x4824,
  VMCS_GUEST_ACTIVITY_STATE         = 0x4826,
  VMCS_GUEST_SMBASE                 = 0x4828,
  VMCS_GUEST_IA32_SYSENTER_CS       = 0x482a,
  VMCS_PREEMPTION_TIMER_VALUE       = 0x482e,
  /* Host-state fields */
  VMCS_HOST_IA32_SYSENTER_CS        = 0x4c00,
};
/**
 * Natural-width VMCS fields.
 *
 * Field encodings as listed in the Intel SDM Vol. 3, Appendix B.
 */
enum Vmx_vmcs_natural_fields
{
  /* Control fields */
  VMCS_CR0_GUEST_HOST_MASK          = 0x6000,
  VMCS_CR4_GUEST_HOST_MASK          = 0x6002,
  VMCS_CR0_READ_SHADOW              = 0x6004,
  VMCS_CR4_READ_SHADOW              = 0x6006,
  VMCS_CR3_TARGET_VALUE0            = 0x6008,
  VMCS_CR3_TARGET_VALUE1            = 0x600a,
  VMCS_CR3_TARGET_VALUE2            = 0x600c,
  VMCS_CR3_TARGET_VALUE3            = 0x600e,
  /* Read-only data fields */
  VMCS_EXIT_QUALIFICATION           = 0x6400,
  VMCS_IO_RCX                       = 0x6402,
  VMCS_IO_RSI                       = 0x6404,
  VMCS_IO_RDI                       = 0x6406,
  VMCS_IO_RIP                       = 0x6408,
  VMCS_GUEST_LINEAR_ADDRESS         = 0x640a,
  /* Guest-state fields */
  VMCS_GUEST_CR0                    = 0x6800,
  VMCS_GUEST_CR3                    = 0x6802,
  VMCS_GUEST_CR4                    = 0x6804,
  VMCS_GUEST_ES_BASE                = 0x6806,
  VMCS_GUEST_CS_BASE                = 0x6808,
  VMCS_GUEST_SS_BASE                = 0x680a,
  VMCS_GUEST_DS_BASE                = 0x680c,
  VMCS_GUEST_FS_BASE                = 0x680e,
  VMCS_GUEST_GS_BASE                = 0x6810,
  VMCS_GUEST_LDTR_BASE              = 0x6812,
  VMCS_GUEST_TR_BASE                = 0x6814,
  VMCS_GUEST_GDTR_BASE              = 0x6816,
  VMCS_GUEST_IDTR_BASE              = 0x6818,
  VMCS_GUEST_DR7                    = 0x681a,
  VMCS_GUEST_RSP                    = 0x681c,
  VMCS_GUEST_RIP                    = 0x681e,
  VMCS_GUEST_RFLAGS                 = 0x6820,
  VMCS_GUEST_PENDING_DBG_EXCEPTIONS = 0x6822,
  VMCS_GUEST_IA32_SYSENTER_ESP      = 0x6824,
  VMCS_GUEST_IA32_SYSENTER_EIP      = 0x6826,
  VMCS_GUEST_IA32_S_CET             = 0x6828,
  VMCS_GUEST_SSP                    = 0x682a,
  VMCS_GUEST_IA32_INTR_SSP_TBL_ADDR = 0x682c,
  /* Host-state fields */
  VMCS_HOST_CR0                     = 0x6c00,
  VMCS_HOST_CR3                     = 0x6c02,
  VMCS_HOST_CR4                     = 0x6c04,
  VMCS_HOST_FS_BASE                 = 0x6c06,
  VMCS_HOST_GS_BASE                 = 0x6c08,
  VMCS_HOST_TR_BASE                 = 0x6c0a,
  VMCS_HOST_GDTR_BASE               = 0x6c0c,
  VMCS_HOST_IDTR_BASE               = 0x6c0e,
  VMCS_HOST_IA32_SYSENTER_ESP       = 0x6c10,
  VMCS_HOST_IA32_SYSENTER_EIP       = 0x6c12,
  VMCS_HOST_RSP                     = 0x6c14,
  VMCS_HOST_RIP                     = 0x6c16,
  VMCS_HOST_IA32_S_CET              = 0x6c18,
  VMCS_HOST_SSP                     = 0x6c1a,
  VMCS_HOST_IA32_INTR_SSP_TBL_ADDR  = 0x6c1c,
};
/**
 * 64-bit width VMCS fields.
 *
 * Field encodings as listed in the Intel SDM Vol. 3, Appendix B.
 */
enum Vmx_vmcs_64bit_fields
{
  /* Control fields */
  VMCS_ADDRESS_IO_BITMAP_A          = 0x2000,
  VMCS_ADDRESS_IO_BITMAP_B          = 0x2002,
  VMCS_ADDRESS_MSR_BITMAP           = 0x2004,
  VMCS_VM_EXIT_MSR_STORE_ADDRESS    = 0x2006,
  VMCS_VM_EXIT_MSR_LOAD_ADDRESS     = 0x2008,
  VMCS_VM_ENTRY_MSR_LOAD_ADDRESS    = 0x200a,
  VMCS_EXECUTIVE_VMCS_POINTER       = 0x200c,
  VMCS_TSC_OFFSET                   = 0x2010,
  VMCS_VIRTUAL_APIC_ADDRESS         = 0x2012,
  VMCS_APIC_ACCESS_ADDRESS          = 0x2014,
  VMCS_PIR_DESCRIPTOR               = 0x2016,
  VMCS_VM_FUNCTION_CONTROL          = 0x2018,
  VMCS_EPT_POINTER                  = 0x201a,
  VMCS_EOI_EXIT_BITMAP0             = 0x201c,
  VMCS_EOI_EXIT_BITMAP1             = 0x201e,
  VMCS_EOI_EXIT_BITMAP2             = 0x2020,
  VMCS_EOI_EXIT_BITMAP3             = 0x2022,
  VMCS_EPTP_LIST_ADDRESS            = 0x2024,
  VMCS_VMREAD_BITMAP_ADDRESS        = 0x2026,
  VMCS_VMWRITE_BITMAP_ADDRESS       = 0x2028,
  VMCS_VIRT_EXCP_INFO_ADDRESS       = 0x202a,
  VMCS_XSS_EXITING_BITMAP           = 0x202c,
  VMCS_ENCLS_EXITING_BITMAP         = 0x202e,
  VMCS_SUBPAGE_PERMISSION_TBL_PTR   = 0x2030,
  VMCS_TSC_MULTIPLIER               = 0x2032,
  VMCS_TER_PROC_BASED_VM_EXEC_CTLS  = 0x2034,
  VMCS_ENCLV_EXITING_BITMAP         = 0x2036,
  VMCS_LOW_PASID_DIR_ADDRESS        = 0x2038,
  VMCS_HIGH_PASID_DIR_ADDRESS       = 0x203a,
  VMCS_SHARED_EPT_POINTER           = 0x203c,
  VMCS_PCONFIG_EXITING_BITMAP       = 0x203e,
  VMCS_HLATP                        = 0x2040,
  VMCS_PID_POINTER_TABLE_ADDRESS    = 0x2042,
  VMCS_SEC_VM_EXIT_CTLS             = 0x2044,
  VMCS_IA32_SPEC_CTRL_MASK          = 0x204a,
  VMCS_IA32_SPEC_CTRL_SHADOW        = 0x204c,
  /* Read-only data fields */
  VMCS_GUEST_PHYSICAL_ADDRESS       = 0x2400,
  /* Guest-state fields */
  VMCS_LINK_POINTER                 = 0x2800,
  VMCS_GUEST_IA32_DEBUGCTL          = 0x2802,
  VMCS_GUEST_IA32_PAT               = 0x2804,
  VMCS_GUEST_IA32_EFER              = 0x2806,
  VMCS_GUEST_IA32_PERF_GLOBAL_CTRL  = 0x2808,
  VMCS_GUEST_PDPTE0                 = 0x280a,
  VMCS_GUEST_PDPTE1                 = 0x280c,
  VMCS_GUEST_PDPTE2                 = 0x280e,
  VMCS_GUEST_PDPTE3                 = 0x2810,
  VMCS_GUEST_IA32_BNDCFGS           = 0x2812,
  VMCS_GUEST_IA32_RTIT_CTL          = 0x2814,
  VMCS_GUEST_IA32_LBR_CTL           = 0x2816,
  VMCS_GUEST_IA32_PKRS              = 0x2818,
  /* Host-state fields */
  VMCS_HOST_IA32_PAT                = 0x2c00,
  VMCS_HOST_IA32_EFER               = 0x2c02,
  VMCS_HOST_IA32_PERF_GLOBAL_CTRL   = 0x2c04,
  VMCS_HOST_IA32_PKRS               = 0x2c06,
};

View File

@@ -0,0 +1,94 @@
/*
* Copyright (C) 2017, 2022, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Philipp Eppelt <philipp.eppelt@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
namespace Vmm {
// Human-readable names for the VMX basic exit reasons, indexed by the basic
// exit-reason number (Intel SDM Vol. 3, Appendix C). Empty strings mark
// reserved/unassigned exit-reason numbers.
static const char *str_exit_reason[] =
{
  /* 0 */ "Exception or NMI",
  /* 1 */ "External interrupt",
  /* 2 */ "Triple fault",
  /* 3 */ "INIT signal",
  /* 4 */ "Start-up IPI",
  /* 5 */ "I/O system-management interrupt",
  /* 6 */ "Other SMI",
  /* 7 */ "Interrupt window",
  /* 8 */ "NMI window",
  /* 9 */ "Task switch",
  /* 10 */ "CPUID instruction",
  /* 11 */ "GETSEC instruction",
  /* 12 */ "HLT instruction",
  /* 13 */ "INVD instruction",
  /* 14 */ "INVLPG instruction",
  /* 15 */ "RDPMC instruction",
  /* 16 */ "RDTSC instruction",
  /* 17 */ "RSM instruction",
  /* 18 */ "VMCALL instruction",
  /* 19 */ "VMCLEAR instruction",
  /* 20 */ "VMLAUNCH instruction",
  /* 21 */ "VMPTRLD instruction",
  /* 22 */ "VMPTRST instruction",
  /* 23 */ "VMREAD instruction",
  /* 24 */ "VMRESUME instruction",
  /* 25 */ "VMWRITE instruction",
  /* 26 */ "VMXOFF instruction",
  /* 27 */ "VMXON instruction",
  /* 28 */ "Control-register accesses",
  /* 29 */ "MOV DR",
  /* 30 */ "I/O instruction",
  /* 31 */ "RDMSR instruction",
  /* 32 */ "WRMSR instruction",
  /* 33 */ "VM-entry failure due to invalid guest state",
  /* 34 */ "VM-entry failure due to MSR loading",
  /* 35 */ "",
  /* 36 */ "MWAIT instruction",
  /* 37 */ "Monitor trap flag",
  /* 38 */ "",
  /* 39 */ "MONITOR instruction",
  /* 40 */ "PAUSE instruction",
  /* 41 */ "VM-entry failure due to machine-check event",
  /* 42 */ "",
  /* 43 */ "TPR below threshold",
  /* 44 */ "APIC access",
  /* 45 */ "Virtualized EOI",
  /* 46 */ "Access to GDTR or IDTR",
  /* 47 */ "Access to LDTR or TR",
  /* 48 */ "EPT violation",
  /* 49 */ "EPT misconfiguration",
  /* 50 */ "INVEPT instruction",
  /* 51 */ "RDTSCP instruction",
  /* 52 */ "VMX-preemption timer expired",
  /* 53 */ "INVVPID instruction",
  /* 54 */ "WBINVD instruction",
  /* 55 */ "XSETBV instruction",
  /* 56 */ "APIC write",
  /* 57 */ "RDRAND instruction",
  /* 58 */ "INVPCID instruction",
  /* 59 */ "VM function invoked",
  /* 60 */ "ENCLS instruction",
  /* 61 */ "RDSEED instruction",
  /* 62 */ "Page-modification log full event",
  /* 63 */ "XSAVES instruction",
  /* 64 */ "XRSTORS instruction",
  /* 65 */ "",
  /* 66 */ "SPP-related event",
  /* 67 */ "UMWAIT instruction",
  /* 68 */ "TPAUSE instruction",
  /* 69 */ "LOADIWKEY instruction"
};
/**
 * Returns a human readable string for any given exit reason.
 *
 * Reserved exit-reason numbers (e.g. 35, 38, 42, 65) have empty entries in
 * str_exit_reason; report those as "Unknown" as well instead of returning an
 * empty string, so log output stays readable.
 */
inline char const *exit_reason_to_str(l4_uint64_t exit_reason)
{
  if (exit_reason < (sizeof(str_exit_reason) / sizeof(str_exit_reason[0])))
    {
      char const *name = str_exit_reason[exit_reason];
      if (*name)
        return name;
    }
  return "Unknown";
}
} // namespace Vmm

View File

@@ -0,0 +1,201 @@
/*
* Copyright (C) 2017, 2021-2022, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Jean Wolter <jean.wolter@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "zeropage.h"
#include "acpi.h"
namespace Vmm {
/**
 * Set the guest kernel command line.
 *
 * \param line  '\0'-terminated command line; at most Max_cmdline_size - 1
 *              (4095) characters.
 *
 * \throws L4::Runtime_error(-L4_EINVAL) if the line does not fit.
 */
void Zeropage::add_cmdline(char const *line)
{
  info().printf("Cmd_line: %s\n", line);

  // strlen() excludes the terminating '\0', strcpy() copies it. _cmdline
  // holds Max_cmdline_size bytes, i.e. up to Max_cmdline_size - 1 characters
  // plus the terminator. The previous check (>= Max_cmdline_size - 1)
  // rejected a command line of exactly 4095 characters even though it fits,
  // contradicting the error message below.
  if (strlen(line) >= Max_cmdline_size)
    L4Re::chksys(-L4_EINVAL, "Maximal command line size is 4095 characters.");

  strcpy(_cmdline, line);
}
/// Record the guest-physical location and size of the initial RAM disk.
/// The values are written into the zeropage later, by write().
void Zeropage::add_ramdisk(l4_uint64_t start, l4_uint64_t sz)
{
  _ramdisk_size = sz;
  _ramdisk_start = start;
}
/**
 * Build the e820 memory map from the VM's RAM regions.
 *
 * \param ram  Guest RAM configuration to scan.
 *
 * Writable regions are reported as usable RAM, others as reserved. The ACPI
 * FACS storage is always appended as a reserved region; add_e820_entry()
 * asserts that a slot is available for it.
 */
void Zeropage::cfg_e820(Vm_ram *ram)
{
  l4_addr_t last_addr = 0;
  ram->foreach_region([this, &last_addr](Vmm::Ram_ds const &r)
    {
      if (_e820_idx < Max_e820_entries)
        add_e820_entry(r.vm_start().get(), r.size(),
                       r.writable() ? E820_ram : E820_reserved);
      // Track the end of the last region seen; used for the filler entry.
      last_addr = r.vm_start().get() + r.size();
    });

  auto facs = Acpi::Facs_storage::get()->mem_region();
  add_e820_entry(facs.start.get(), facs.end - facs.start + 1,
                 E820_reserved);

  // e820 memory map: Linux expects at least two entries to be present to
  // qualify as a e820 map. From our side, the second entry is currently
  // unused and has no backing memory. see linux/boot/x86/kernel/e820.c
  if (last_addr && _e820_idx < 2)
    add_e820_entry(last_addr, L4_PAGESIZE , E820_reserved);
}
/// Remember where the loader placed the device tree so that write_dtb() can
/// later link it into the zeropage's Setup_data chain.
void Zeropage::add_dtb(l4_addr_t dt_addr, l4_size_t size)
{
  _dtb_size = size;
  _dtb_boot_addr = dt_addr;
}
/**
 * Register the callback that fills in screen info on the zeropage.
 *
 * \param cb  Callback invoked by write() with the host address of the
 *            zeropage. May be set only once (asserted).
 */
void Zeropage::set_screen_callback(std::function<void (void *)> cb)
{
  assert(!_screen_cb);
  // Move the sink parameter into place instead of copying: avoids a
  // potential allocation for the captured callable state.
  _screen_cb = std::move(cb);
}
/**
 * Write all collected boot information into the zeropage in guest RAM.
 *
 * \param ram  Guest RAM mapper used to access the zeropage and kernel image.
 * \param gt   Type of the loaded kernel binary (ELF or Linux bzImage-style).
 *
 * \throws L4::Runtime_error(-L4_EINVAL) for unsupported binary types.
 *
 * The reported boot-protocol version is raised from the 2.07 baseline as
 * features require it (DTB -> 2.09, xloadflags -> 2.12).
 */
void Zeropage::write(Vm_ram *ram, Boot::Binary_type const gt)
{
  memset(ram->guest2host<void *>(_gp_addr), 0, L4_PAGESIZE);

  // boot_params are setup according to v.2.07
  unsigned boot_protocol_version = 0x207;
  switch (gt)
    {
    case Boot::Binary_type::Elf:
      // Note: The _kbinary variable contains the ELF binary entry
      write_dtb(ram);
      set_header<l4_addr_t>(ram, Bp_code32_start, _kbinary.get());
      set_header<l4_uint32_t>(ram, Bp_signature, 0x53726448); // "HdrS"
      boot_protocol_version = 0x209; // DTS needs v.2.09
      info().printf("Elf guest zeropage: dtb 0x%llx, entry 0x%lx\n",
                    get_header<l4_uint64_t>(ram, Bp_setup_data),
                    get_header<l4_addr_t>(ram, Bp_code32_start));
      break;

    case Boot::Binary_type::Linux:
      {
        // Note: The _kbinary variable contains start of the kernel binary
        // constants taken from $lx_src/Documentation/x86/boot.txt
        // Byte at offset 0x201 is the setup_header size byte.
        l4_uint8_t hsz = *ram->guest2host<unsigned char *>(_kbinary + 0x0201);
        // calculate size of the setup_header in the zero page/boot params
        l4_size_t boot_hdr_size = (0x0202 + hsz) - Bp_boot_header;
        // Copy the kernel's setup_header verbatim into the zeropage.
        memcpy(ram->guest2host<void *>(_gp_addr + Bp_boot_header),
               ram->guest2host<void *>(_kbinary + Bp_boot_header),
               boot_hdr_size);
        break;
      }

    default:
      L4Re::throw_error(-L4_EINVAL, "Unsupported binary type.");
      break;
    }

  write_cmdline(ram);

  // write e820
  assert(_e820_idx > 0);
  memcpy(ram->guest2host<void *>(_gp_addr + Bp_e820_map), _e820,
         sizeof(E820_entry) * _e820_idx);
  set_header<l4_uint8_t>(ram, Bp_e820_entries, _e820_idx);

  // write RAM disk
  set_header<l4_uint32_t>(ram, Bp_ramdisk_image, _ramdisk_start);
  set_header<l4_uint32_t>(ram, Bp_ramdisk_size, _ramdisk_size);
  // If the RAM disk reaches above 4 GiB, the upper 32 bits of address and
  // size go into the ext_* fields and xloadflags must advertise support.
  if ((_ramdisk_start + _ramdisk_size) >> 32 > 0)
    {
      Xloadflags xlf;
      xlf.can_be_loaded_above_4g() = 1;
      set_header<l4_uint16_t>(ram, Bp_xloadflags, xlf.raw);
      set_header<l4_uint32_t>(ram, Bp_ext_ramdisk_image, _ramdisk_start >> 32);
      set_header<l4_uint32_t>(ram, Bp_ext_ramdisk_size, _ramdisk_size >> 32);
      boot_protocol_version = 0x212; // xloadflags needs v.2.12
    }

  // misc stuff in the boot header
  set_header<l4_uint8_t>(ram, Bp_type_of_loader, 0xff);
  set_header<l4_uint16_t>(ram, Bp_version, boot_protocol_version);
  set_header<l4_uint8_t>(ram, Bp_loadflags,
                         get_header<l4_uint8_t>(ram, Bp_loadflags)
                           | Bp_loadflags_keep_segments_bit);

  // add screen info if necessary
  if (_screen_cb)
    _screen_cb(ram->guest2host<void *>(addr()));
}
/// Append one region to the staged e820 table (copied into the zeropage by
/// write()). Asserts that a free slot is available.
void Zeropage::add_e820_entry(l4_uint64_t addr, l4_uint64_t size, l4_uint32_t type)
{
  assert(_e820_idx < Max_e820_entries);

  E820_entry &entry = _e820[_e820_idx++];
  entry.addr = addr;
  entry.size = size;
  entry.type = type;
}
// add an entry to the single-linked list of Setup_data
//
// The new element inherits the current list head from the Bp_setup_data
// header field and the header is updated to point at the new element, i.e.
// the list is prepended to (links are guest-physical addresses).
void Zeropage::add_setup_data(Vm_ram *ram, Setup_data *sd, l4_addr_t guest_addr)
{
  sd->next = get_header<l4_uint64_t>(ram, Bp_setup_data);
  set_header<l4_uint64_t>(ram, Bp_setup_data, guest_addr);
}
/**
 * Copy the command line into guest RAM and reference it from the zeropage.
 *
 * Does nothing if no command line was set via add_cmdline(). The command
 * line is placed at the first page-aligned address behind the boot
 * parameters.
 */
void Zeropage::write_cmdline(Vm_ram *ram)
{
  if (*_cmdline == 0)
    return;

  // place the command line behind the boot parameters
  auto cmdline_addr = (_gp_addr + Bp_end).round_page();
  strcpy(ram->guest2host<char *>(cmdline_addr), _cmdline);
  set_header<l4_uint32_t>(ram, Bp_cmdline_ptr, cmdline_addr.get());
  set_header<l4_uint32_t>(ram, Bp_cmdline_size, strlen(_cmdline));
  info().printf("cmdline check: %s\n", ram->guest2host<char *>(cmdline_addr));
}
/**
 * Link the device tree into the zeropage's Setup_data chain.
 *
 * Does nothing if no DTB was registered via add_dtb(). A Setup_data header
 * is constructed directly in front of the DTB in guest RAM so that
 * Setup_data.data coincides with the first DT byte.
 *
 * \throws L4::Runtime_error(-L4_EEXIST) if the memory in front of the DTB is
 *         already in use (any non-zero byte).
 */
void Zeropage::write_dtb(Vm_ram *ram)
{
  if (_dtb_boot_addr == 0 || _dtb_size == 0)
    return;

  // dt_boot_addr is the guest address of the DT memory; Setup_data.data
  // must be the first byte of the DT. The rest of the Setup_data struct
  // must go right before it. Hopefully, there is space.
  unsigned sd_hdr_size = sizeof(Setup_data) + sizeof(Setup_data::data);
  auto dtb = ram->boot2guest_phys(_dtb_boot_addr);
  auto *sd = ram->guest2host<Setup_data *>(dtb - sd_hdr_size);

  // Verify the header memory is all zero. The previous loop re-initialized
  // its scan pointer on every iteration and therefore checked only the very
  // first byte sd_hdr_size times; scan each byte exactly once instead.
  auto const *sd_bytes = reinterpret_cast<char const *>(sd);
  for (unsigned i = 0; i < sd_hdr_size; ++i)
    if (sd_bytes[i])
      L4Re::chksys(-L4_EEXIST, "DTB Setup_data header memory in use.");

  sd->type = Setup_dtb;
  sd->len = _dtb_size;
  // sd->data is the first DT byte.
  add_setup_data(ram, sd, _dtb_boot_addr - sd_hdr_size);
}
// Definition of the static screen-info callback (set via set_screen_callback()).
std::function<void (void *)> Zeropage::_screen_cb;
}

View File

@@ -0,0 +1,160 @@
/*
* Copyright (C) 2017-2018, 2021-2022, 2024 Kernkonzept GmbH.
* Author(s): Philipp Eppelt <philipp.eppelt@kernkonzept.com>
* Jean Wolter <jean.wolter@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/types.h>
#include <functional>
#include "debug.h"
#include "vm_ram.h"
#include "binary_loader.h"
namespace Vmm {
/**
 * Byte offsets of fields within the Linux x86 zeropage (struct boot_params).
 *
 * See $lx_src/Documentation/x86/boot.rst and
 * arch/x86/include/uapi/asm/bootparam.h. Offsets from Bp_boot_header onward
 * lie inside the embedded setup_header.
 */
enum Boot_param
{
  Bp_ext_ramdisk_image = 0x0c0,
  Bp_ext_ramdisk_size = 0x0c4,
  Bp_ext_cmd_line_ptr = 0x0c8,
  Bp_e820_entries = 0x1e8,
  Bp_boot_header = 0x1f1,
  Bp_setup_sects = 0x1f1,
  Bp_signature = 0x202,
  Bp_version = 0x206,
  Bp_type_of_loader = 0x210,
  Bp_loadflags = 0x211,
  Bp_code32_start = 0x214,
  Bp_ramdisk_image = 0x218,
  Bp_ramdisk_size = 0x21c,
  Bp_ext_loader_ver = 0x226,
  Bp_ext_loader_type = 0x227,
  Bp_cmdline_ptr = 0x228,
  Bp_xloadflags = 0x236,
  Bp_cmdline_size = 0x238,
  Bp_setup_data = 0x250,
  Bp_init_size = 0x260,
  Bp_e820_map = 0x2d0,
  Bp_end = 0xeed, // after EDD data array
};
/**
 * Builder for the Linux x86 zeropage (struct boot_params) of a guest.
 *
 * Collects command line, RAM disk, e820 map and device tree information and
 * writes them into the guest's boot-parameter page via write().
 */
class Zeropage
{
  /// Element of the boot_params setup_data singly-linked list.
  struct Setup_data
  {
    l4_uint64_t next;   // guest-physical address of the next element
    l4_uint32_t type;   // one of Setup_data_types
    l4_uint32_t len;    // payload length in bytes
    l4_uint8_t data[0]; // payload directly follows the header
  };

  /// Values for Setup_data::type.
  enum Setup_data_types
  {
    Setup_none = 0,
    Setup_e820_ext,
    Setup_dtb,
    Setup_pci,
    Setup_efi,
  };

  /// e820 region types used here (subset of the BIOS-defined set).
  enum E820_types
  {
    E820_ram = 1,
    E820_reserved = 2
  };

  struct E820_entry
  {
    l4_uint64_t addr; // start of segment
    l4_uint64_t size;
    l4_uint32_t type;
  } __attribute__((packed));

  /// Boot-protocol 'xloadflags' field (boot protocol >= 2.12).
  struct Xloadflags
  {
    l4_uint16_t raw = 0;
    /// Kernel has the legacy 64-bit entry point at 0x200.
    CXX_BITFIELD_MEMBER(0, 0, kernel_64, raw);
    /// Kernel/Boot_params/cmdline/ramdisk can be above 4G
    CXX_BITFIELD_MEMBER(1, 1, can_be_loaded_above_4g, raw);
    // bits 4:2 are EFI related; the remaining bits are unused;
  };

  enum
  {
    Max_cmdline_size = 4096,
    Max_e820_entries = 5,
    Bp_loadflags_keep_segments_bit = 0x40
  };

  Vmm::Guest_addr _gp_addr; ///< VM physical address of the zero page
  Vmm::Guest_addr const _kbinary; // VM physical address of the kernel binary
  char _cmdline[Max_cmdline_size];    // '\0'-terminated guest command line
  E820_entry _e820[Max_e820_entries]; // staged e820 map, copied by write()
  unsigned _e820_idx = 0;             // number of valid entries in _e820
  l4_uint64_t _ramdisk_start = 0;     // guest-physical RAM disk address
  l4_uint64_t _ramdisk_size = 0;
  l4_addr_t _dtb_boot_addr = 0;       // boot address of the DTB (0 = none)
  l4_size_t _dtb_size = 0;

public:
  /**
   * \param addr    Guest-physical address of the zeropage.
   * \param kernel  Guest-physical address of the kernel binary (for ELF
   *                binaries this is the entry point, see write()).
   */
  Zeropage(Vmm::Guest_addr addr, l4_addr_t kernel)
  : _gp_addr(addr), _kbinary(kernel)
  {
    info().printf("Zeropage @ 0x%lx, Kernel @ 0x%lx\n", addr.get(), kernel);
    memset(_cmdline, 0, Max_cmdline_size);
    memset(_e820, 0, Max_e820_entries * sizeof(E820_entry));
  }

  /// Set the kernel command line (at most Max_cmdline_size - 1 characters).
  void add_cmdline(char const *line);

  /// Record guest-physical location and size of the initial RAM disk.
  void add_ramdisk(l4_uint64_t start, l4_uint64_t sz);

  /// Populate the e820 map from the VM's RAM configuration.
  void cfg_e820(Vm_ram *ram);

  /**
   * Add a device tree.
   *
   * \param dt_addr Address of the device tree in guest RAM.
   * \param size Size of the device tree.
   */
  void add_dtb(l4_addr_t dt_addr, l4_size_t size);

  /// Register a callback filling in screen info; may be set only once.
  static void set_screen_callback(std::function<void (void *)> cb);

  /// Write all collected information into the zeropage in guest RAM.
  void write(Vm_ram *ram, Boot::Binary_type const gt);

  /// Guest-physical address of the zeropage.
  Vmm::Guest_addr addr() const { return _gp_addr; }

  /// Guest entry point as recorded in the boot header.
  l4_uint32_t entry(Vm_ram *ram)
  { return get_header<l4_uint32_t>(ram, Bp_code32_start); }

private:
  static std::function<void (void *)> _screen_cb;

  static Dbg trace() { return Dbg(Dbg::Core, Dbg::Trace); }
  static Dbg info() { return Dbg(Dbg::Core, Dbg::Info); }

  void add_e820_entry(l4_uint64_t addr, l4_uint64_t size, l4_uint32_t type);

  // add an entry to the single-linked list of Setup_data
  void add_setup_data(Vm_ram *ram, Setup_data *sd, l4_addr_t guest_addr);

  void write_cmdline(Vm_ram *ram);
  void write_dtb(Vm_ram *ram);

  /// Write a value into the zeropage at byte offset \a field.
  template <typename T>
  void set_header(Vm_ram *ram, unsigned field, T value)
  { *ram->guest2host<T *>(_gp_addr + field) = value; }

  /// Read a value from the zeropage at byte offset \a field.
  template <typename T>
  T get_header(Vm_ram *ram, unsigned field)
  { return *ram->guest2host<T *>(_gp_addr + field); }
};
} // namespace Vmm

View File

@@ -0,0 +1,20 @@
#pragma once
#include <arm_hyp.h>
#include <l4/re/error_helper>
/// Arm(32)-specific vCPU setup: set the default VPIDR and verify that the
/// host CPU supports the requested memory system architecture (PMSA/VMSA),
/// throwing -L4_ENOSYS otherwise.
inline void arm_subarch_setup(void *vcpu, bool, bool pmsa)
{
  // VPIDR default: report this host CPU's MIDR to the guest.
  unsigned long midr;
  asm volatile ("mrc p15, 0, %0, c0, c0, 0" : "=r" (midr));
  l4_vcpu_e_write_32(vcpu, L4_VCPU_E_VPIDR, midr);

  unsigned long id_mmfr0;
  asm ("mrc p15, 0, %0, c0, c1, 4": "=r" (id_mmfr0));

  bool const have_pmsa = (id_mmfr0 & 0xf0) != 0;
  bool const have_vmsa = (id_mmfr0 & 0x0f) != 0;
  if (pmsa)
    {
      if (!have_pmsa)
        L4Re::throw_error(-L4_ENOSYS, "CPU does not support PMSA");
    }
  else if (!have_vmsa)
    L4Re::throw_error(-L4_ENOSYS, "CPU does not support VMSA");
}

View File

@@ -0,0 +1,7 @@
/*
* Copyright (C) 2017-2021, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/

View File

@@ -0,0 +1,45 @@
/*
* Copyright (C) 2017, 2019-2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/sys/compiler.h>
// Low-level vCPU entry trampoline (Arm32).
//
// Written in assembly because it runs directly on vCPU entry: it saves the
// vCPU pointer and TPIDRURW, indexes into the vcpu_entries vector table
// using bits of vcpu->r.err, calls the selected handler, and finally
// resumes the guest via the vCPU-resume syscall (hvc).
extern "C" void vcpu_entry(l4_vcpu_state_t *vcpu);
asm
(
  "vcpu_entry: \n"
  " mov r4, r0 \n" // r4: save r0
  " mrc p15, 0, r5, c13, c0, 2 \n" // r5: save TPIDRURW
  " ldr r2, [r0, #0x140] \n" // l4_vcpu_e_info_user()[0]
  " ldr r3, [r0, #0x24] \n" // vcpu->r.err
  " mcr p15, 0, r2, c13, c0, 2 \n"
  " lsr r3, r3, #24 \n"
  " bic r3, r3, #3 \n"
#ifdef __PIC__
  " ldr r12, 2f \n" // load offset to vcpu_entries
  "1:add r12, pc, r12 \n" // convert to absolute address
#else
  " movw r12, #:lower16:vcpu_entries \n"
  " movt r12, #:upper16:vcpu_entries \n"
#endif
  " add r12, r12, r3 \n"
  " ldr r12, [r12] \n"
  " blx r12 \n"
  " mov r0, r4 \n"
  " bl prepare_guest_entry \n" // sets MR[0] = L4_THREAD_VCPU_RESUME_OP
  " movw r2, #0xf803 \n" // and tag(L4_PROTO_THREAD, 1, 0, 0)
  " movt r2, #0xffff \n" // dest = L4_INVALID_CAP, flags = call
  " mov r3, #0 \n" // timeout never
  " mcr p15, 0, r5, c13, c0, 2 \n" // restore TPIDRURW from r5
  " mov r5, #" L4_stringify(L4_SYSCALL_INVOKE) " \n"
  " hvc #0 \n"
  " \n"
#ifdef __PIC__
  "2: .word vcpu_entries - (1b + 8)\n"
#endif
);

View File

@@ -0,0 +1,91 @@
/*
* Copyright (C) 2021, 2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "arm_exc.h"
#include "guest.h"
namespace Vmm {
namespace {
using namespace Arm;
/**
 * Emulate AArch32 exception entry for the guest vCPU.
 *
 * \param vcpu  Guest vCPU.
 * \param mode  Target PE mode (Aarch32::Psr_m_und or Aarch32::Psr_m_abt).
 * \param off   Vector-table offset of the exception to enter.
 *
 * Computes the new PSR, writes the banked SPSR/LR of the target mode via
 * msr (NOTE(review): assumes these banked registers hold guest state here —
 * confirm against the vCPU context-switch model) and redirects the guest to
 * its exception vector.
 */
void enter_exception(Vcpu_ptr vcpu, unsigned mode, Aarch32::Exc_offset off)
{
  l4_uint32_t spsr = vcpu->r.flags;
  vcpu->r.flags = Aarch32::get_except_flags(vcpu, mode);
  l4_addr_t return_addr =
    vcpu->r.ip + Aarch32::get_return_offset(off, spsr & Aarch32::Psr_t);

  // SPSR and LR are banked registers for the PE mode the exception is taken to.
  if (mode == Aarch32::Psr_m_und)
    {
      asm volatile("msr SPSR_und, %0" : : "r"(spsr));
      asm volatile("msr LR_und, %0" : : "r"(return_addr));
    }
  else if (mode == Aarch32::Psr_m_abt)
    {
      asm volatile("msr SPSR_abt, %0" : : "r"(spsr));
      asm volatile("msr LR_abt, %0" : : "r"(return_addr));
    }

  l4_uint32_t vbar;
  if (l4_vcpu_e_read_32(*vcpu, L4_VCPU_E_SCTLR) & Aarch32::Sctlr_v)
    // The guest uses high exception vectors.
    vbar = 0xffff0000;
  else
    asm volatile ("mrc p15, 0, %0, c12, c0, 0" : "=r"(vbar)); // VBAR

  vcpu->r.ip = vbar + static_cast<unsigned>(off);
}
}
// Fault injection is supported on this subarch in addition to the generic
// fault modes.
bool Guest::fault_mode_supported(Fault_mode mode)
{
  if (mode == Fault_mode::Inject)
    return true;

  return Generic_guest::fault_mode_supported(mode);
}
/**
 * Inject a data or prefetch abort into the guest.
 *
 * \param vcpu  Guest vCPU.
 * \param inst  True for a prefetch (instruction) abort, false for a data
 *              abort.
 * \param addr  Faulting address reported to the guest.
 *
 * Writes the guest's fault status/address registers (IFSR/IFAR or
 * DFSR/DFAR) and performs exception entry into abort mode.
 *
 * \retval true  Injection was performed.
 */
bool Guest::inject_abort(Vcpu_ptr vcpu, bool inst, l4_addr_t addr)
{
  l4_uint32_t ttbcr;
#ifdef CONFIG_MMU
  asm volatile ("mrc p15, 0, %0, c2, c0, 2" : "=r"(ttbcr)); // TTBCR
#else
  // No MMU: use the fixed EAE (long-descriptor) fault-status format.
  ttbcr = Ttbcr_eae;
#endif
  l4_uint32_t fsr = Aarch32::get_abort_fsr(ttbcr);

  Aarch32::Exc_offset off;
  if (inst)
    {
      off = Aarch32::Exc_offset::Prefetch_abort;
      asm volatile("mcr p15, 0, %0, c6, c0, 2 " : : "r"(addr)); // IFAR
      asm volatile("mcr p15, 0, %0, c5, c0, 1 " : : "r"(fsr)); // IFSR
    }
  else
    {
      off = Aarch32::Exc_offset::Data_abort;
      asm volatile("mcr p15, 0, %0, c6, c0, 0 " : : "r"(addr)); // DFAR
      asm volatile("mcr p15, 0, %0, c5, c0, 0 " : : "r"(fsr)); // DFSR
    }

  enter_exception(vcpu, Aarch32::Psr_m_abt, off);
  return true;
}
/// Inject an undefined-instruction exception into the guest.
/// \retval true  Injection was performed.
bool Guest::inject_undef(Vcpu_ptr vcpu)
{
  enter_exception(vcpu, Aarch32::Psr_m_und,
                  Aarch32::Exc_offset::Undefined_inst);
  return true;
}
}

View File

@@ -0,0 +1,40 @@
/*
* Copyright (C) 2017, 2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Christian Pötzsch <christian.poetzsch@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/sys/thread.h>
#include <l4/re/elf_aux.h>
#include "guest.h"
// ELF auxiliary record: request the ex-regs flag that selects EL1 for this
// thread (presumably so the VMM runs at EL1 and can issue hvc — confirm
// against the L4Re thread API).
L4RE_ELF_AUX_ELEM_T(l4re_elf_aux_mword_t, __ex_regs_flags,
                    L4RE_ELF_AUX_T_EX_REGS_FLAGS,
                    L4_THREAD_EX_REGS_ARM_SET_EL_EL1);

// Override the syscall symbol from the l4sys library. Relies on the ELF
// linking behaviour which ignores symbols from libraries that are already
// defined by the program or some other library before (in link order).
// This variant enters the kernel via hvc instead of the default mechanism.
asm (
  ".global __l4_sys_syscall\n"
  ".type __l4_sys_syscall, #function\n"
  "__l4_sys_syscall:\n"
  " hvc #0\n"
  " bx lr\n"
);
namespace Vmm {

// No-op stub: AArch64 system registers do not exist in this 32-bit Arm
// build, so there is nothing to register.
void
Guest::add_sys_reg_aarch64(unsigned, unsigned,
                           unsigned, unsigned,
                           unsigned,
                           cxx::Ref_ptr<Vmm::Arm::Sys_reg> const &)
{}

// No subarch-specific initialization required on 32-bit Arm.
void
Guest::subarch_init()
{}

}

View File

@@ -0,0 +1,14 @@
/*
* Copyright (C) 2017-2020, 2022, 2024 Kernkonzept GmbH.
* Author(s): Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
namespace Vmm {
// This (32-bit Arm) build cannot run 64-bit guests.
enum { Guest_64bit_supported = false };
}

View File

@@ -0,0 +1,70 @@
/*
* Copyright (C) 2019, 2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
* Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include <cstring>
#include "vcpu_ptr.h"
#include "monitor/monitor.h"
#include "monitor/monitor_args.h"
namespace Monitor {
/// Monitor command handler for CPU state; the primary template is an empty
/// stub used when monitor support is compiled out.
template<bool, typename T>
class Cpu_dev_cmd_handler {};

/**
 * Enabled monitor command handler providing the 'cpu <i> regs' command.
 *
 * CRTP: \a T must provide a vcpu() accessor returning the Vmm::Vcpu_ptr of
 * the CPU device this handler belongs to.
 */
template<typename T>
class Cpu_dev_cmd_handler<true, T> : public Cmd
{
public:
  char const *help() const override
  { return "CPU state"; }

  void usage(FILE *f) const override
  {
    fprintf(f, "%s\n"
               "* 'cpu <i> regs': dump CPU registers\n",
            help());
  }

  void complete(FILE *f, Completion_request *compl_req) const override
  { compl_req->complete(f, "regs"); }

  /// Dispatch the subcommand; only "regs" is currently supported.
  void exec(FILE *f, Arglist *args) override
  {
    if (*args == "regs")
      show_regs(f);
    else
      argument_error("Invalid subcommand");
  }

  /// Print pc/lr/sp/flags and r0-r12 of the vCPU to \a f.
  void show_regs(FILE *f) const
  {
    auto vcpu = get_vcpu();
    auto regs = vcpu->r;
    fprintf(f, "pc=%08lx lr=%08lx sp=%08lx flags=%08lx\n",
            regs.ip, vcpu.get_lr(), vcpu.get_sp(), regs.flags);
    fprintf(f, " r0=%08lx r1=%08lx r2=%08lx r3=%08lx\n",
            regs.r[0], regs.r[1], regs.r[2], regs.r[3]);
    fprintf(f, " r4=%08lx r5=%08lx r6=%08lx r7=%08lx\n",
            regs.r[4], regs.r[5], regs.r[6], regs.r[7]);
    fprintf(f, " r8=%08lx r9=%08lx r10=%08lx r11=%08lx\n",
            vcpu.get_gpr(8), vcpu.get_gpr(9), vcpu.get_gpr(10),
            vcpu.get_gpr(11));
    fprintf(f, "r12=%08lx\n", vcpu.get_gpr(12));
  }

private:
  Vmm::Vcpu_ptr get_vcpu() const
  { return static_cast<T const *>(this)->vcpu(); }
};
}

View File

@@ -0,0 +1,253 @@
/*
* Copyright (C) 2015-2021, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cassert>
#include "aarch32_hyp.h"
#include "generic_vcpu_ptr.h"
#include "mem_access.h"
namespace Vmm {
/**
 * AArch32-specific view of a vCPU state.
 *
 * Extends Generic_vcpu_ptr with HSR decoding, MMIO access decoding and
 * general-purpose register access, including banked registers of the
 * guest's current PE mode (accessed via mrs/msr on the host).
 */
class Vcpu_ptr : public Generic_vcpu_ptr
{
public:
  explicit Vcpu_ptr(l4_vcpu_state_t *s) : Generic_vcpu_ptr(s) {}

  /// True if the pending fault was caused by a write access (from the HSR).
  bool pf_write() const
  { return hsr().pf_write(); }

  /// Read the counter frequency (CNTFRQ).
  static l4_uint32_t cntfrq()
  {
    l4_uint32_t x;
    asm volatile("mrc p15, 0, %0, c14, c0, 0" : "=r" (x));
    return x;
  }

  /// Read the virtual counter (CNTVCT).
  static l4_uint64_t cntvct()
  {
    l4_uint64_t x;
    asm volatile ("mrrc p15, 1, %Q0, %R0, c14" : "=r"(x));
    return x;
  }

  /// Read the virtual timer compare value (CNTV_CVAL).
  static l4_uint64_t cntv_cval()
  {
    l4_uint64_t x;
    asm volatile ("mrrc p15, 3, %Q0, %R0, c14" : "=r"(x));
    return x;
  }

  /// Attach the current thread to this vCPU and publish its UTCB in the
  /// vCPU user-info area (read back by the vcpu_entry trampoline).
  void thread_attach()
  {
    control_ext(L4::Cap<L4::Thread>());
    reinterpret_cast<l4_utcb_t **>(l4_vcpu_e_info_user(_s))[0] = l4_utcb();
  }

  /// Hypervisor syndrome register of the current VM exit.
  Arm::Hsr hsr() const
  { return Arm::Hsr(_s->r.err); }

  /// Advance the guest PC over the trapped instruction (2 or 4 bytes,
  /// selected by the HSR instruction-length bit).
  void jump_instruction() const
  { _s->r.ip += 2 << hsr().il(); }

  /**
   * Check whether register 'x' is a user mode register for the current mode
   *
   * \retval true Register is a normal register accessible in l4_vcpu_state_t
   * \retval false Register is a banked register which needs special treatment
   */
  bool use_ureg(unsigned x) const
  {
    // registers < 8 are always the user registers
    if (x < 8)
      return true;

    // one byte for each (legal) mode, where a set bit (x - 8) means
    // register r[x] is a user register, modes are
    //
    // usr, fiq, irq, svc,
    //    ,    , mon, abt,
    //    ,    , hyp, und,
    //    ,    ,    , sys
    //
    // fiq is handled separately, mon/hyp are invalid (trap to el2/el3).
    static l4_uint8_t const i[] =
      { 0xff, 0x00, 0x3f, 0x3f,
        0x00, 0x00, 0x00, 0x3f,
        0x00, 0x00, 0x00, 0x3f,
        0x00, 0x00, 0x00, 0xff };
    return i[_s->r.flags & 0x0f] & (1 << (x - 8));
  }

  /**
   * Caculate jump offset used for accessing non-user SP and LR in
   * 'irq', 'svc', 'abt' or 'und' mode
   *
   * The calculation does not check whether the mode is valid.
   *
   * \return Jump offset
   */
  unsigned mode_offs() const
  {
    // mode (lower 5bits of flags):
    //
    // 0x12 -> 0, irq
    // 0x13 -> 2, svc
    // 0x17 -> 4, abt
    // 0x1b -> 6, und
    //
    // all other (non hyp) modes use all user registers, are handled
    // separately (fiq) or are illegal
    return ((_s->r.flags + 1) >> 1) & 0x6;
  }

  /**
   * Read guest general-purpose register \a x (r0-r14) for the guest's
   * current PE mode; banked registers are read via mrs.
   *
   * \return Register value, or 0 for x > 14.
   */
  l4_umword_t get_gpr(unsigned x) const
  {
    if (L4_UNLIKELY(x > 14))
      return 0;

    if (use_ureg(x))
      switch (x)
        {
        case 14: return _s->r.lr;
        case 13: return _s->r.sp;
        default: return _s->r.r[x];
        }

    if (0)
      printf("SPECIAL GET GPR: m=%2lx x=%u\n", (_s->r.flags & 0x1f), x);

    l4_umword_t res;
    if ((_s->r.flags & 0x1f) == 0x11) // FIQ
      {
        switch (x - 8)
          {
          case 0: asm ("mrs %[res], R8_fiq " : [res]"=r"(res)); break;
          case 1: asm ("mrs %[res], R9_fiq " : [res]"=r"(res)); break;
          case 2: asm ("mrs %[res], R10_fiq" : [res]"=r"(res)); break;
          case 3: asm ("mrs %[res], R11_fiq" : [res]"=r"(res)); break;
          case 4: asm ("mrs %[res], R12_fiq" : [res]"=r"(res)); break;
          case 5: asm ("mrs %[res], SP_fiq " : [res]"=r"(res)); break;
          case 6: asm ("mrs %[res], LR_fiq " : [res]"=r"(res)); break;
          default: __builtin_unreachable();
          }
        return res;
      }

    // Should we check whether we have a valid mode (irq, svc, abt, und) here?
    switch (x - 13 + mode_offs())
      {
      case 0: asm ("mrs %[res], SP_irq" : [res]"=r"(res)); break;
      case 1: asm ("mrs %[res], LR_irq" : [res]"=r"(res)); break;
      case 2: asm ("mrs %[res], SP_svc" : [res]"=r"(res)); break;
      case 3: asm ("mrs %[res], LR_svc" : [res]"=r"(res)); break;
      case 4: asm ("mrs %[res], SP_abt" : [res]"=r"(res)); break;
      case 5: asm ("mrs %[res], LR_abt" : [res]"=r"(res)); break;
      case 6: asm ("mrs %[res], SP_und" : [res]"=r"(res)); break;
      case 7: asm ("mrs %[res], LR_und" : [res]"=r"(res)); break;
      default: __builtin_unreachable();
      }
    return res;
  }

  /**
   * Write guest general-purpose register \a x (r0-r14) for the guest's
   * current PE mode; banked registers are written via msr.
   * Writes to x > 14 are silently ignored.
   */
  void set_gpr(unsigned x, l4_umword_t value) const
  {
    if (L4_UNLIKELY(x > 14))
      return;

    if (use_ureg(x))
      switch (x)
        {
        case 14: _s->r.lr = value; return;
        case 13: _s->r.sp = value; return;
        default: _s->r.r[x] = value; return;
        }

    if (0)
      printf("SPECIAL SET GPR: m=%2lx x=%u\n", (_s->r.flags & 0x1f), x);

    if ((_s->r.flags & 0x1f) == 0x11) // FIQ
      {
        switch (x - 8)
          {
          case 0: asm ("msr R8_fiq, %[v]" : : [v]"r"(value)); break;
          case 1: asm ("msr R9_fiq, %[v]" : : [v]"r"(value)); break;
          case 2: asm ("msr R10_fiq, %[v]" : : [v]"r"(value)); break;
          case 3: asm ("msr R11_fiq, %[v]" : : [v]"r"(value)); break;
          case 4: asm ("msr R12_fiq, %[v]" : : [v]"r"(value)); break;
          case 5: asm ("msr SP_fiq, %[v]" : : [v]"r"(value)); break;
          case 6: asm ("msr LR_fiq, %[v]" : : [v]"r"(value)); break;
          default: __builtin_unreachable();
          }
        return;
      }

    // Should we check whether we have a valid mode (irq, svc, abt, und) here?
    switch (x - 13 + mode_offs())
      {
      case 0: asm ("msr SP_irq, %[v]" : : [v]"r"(value)); break;
      case 1: asm ("msr LR_irq, %[v]" : : [v]"r"(value)); break;
      case 2: asm ("msr SP_svc, %[v]" : : [v]"r"(value)); break;
      case 3: asm ("msr LR_svc, %[v]" : : [v]"r"(value)); break;
      case 4: asm ("msr SP_abt, %[v]" : : [v]"r"(value)); break;
      case 5: asm ("msr LR_abt, %[v]" : : [v]"r"(value)); break;
      case 6: asm ("msr SP_und, %[v]" : : [v]"r"(value)); break;
      case 7: asm ("msr LR_und, %[v]" : : [v]"r"(value)); break;
      default: __builtin_unreachable();
      }
  }

  /// SP of the guest's current mode.
  l4_umword_t get_sp() const
  {
    return get_gpr(13);
  }

  /// LR of the guest's current mode.
  l4_umword_t get_lr() const
  {
    return get_gpr(14);
  }

  /**
   * Decode the current trap as an MMIO access from the HSR.
   *
   * \return Mem_access with width/direction and, for stores, the value;
   *         access type Other if the syndrome is not a decodable access.
   */
  Mem_access decode_mmio() const
  {
    Mem_access m;

    if (!hsr().pf_isv() || hsr().pf_srt() > 14)
      {
        m.access = Mem_access::Other;
        return m;
      }

    m.width = hsr().pf_sas();
    m.access = hsr().pf_write() ? Mem_access::Store : Mem_access::Load;

    if (m.access == Mem_access::Store)
      m.value = get_gpr(hsr().pf_srt());

    return m;
  }

  /// Write the (width/sign extended) result of an MMIO load back into the
  /// target register encoded in the HSR.
  void writeback_mmio(Mem_access const &m) const
  {
    assert(m.access == Mem_access::Load);

    l4_umword_t v = reg_extend_width(m.value, hsr().pf_sas(), hsr().pf_sse());
    set_gpr(hsr().pf_srt(), v);
  }

  /// Read the virtual timer PPI configuration from the vCPU state.
  Arm::Gic_h::Vcpu_ppi_cfg vtmr() const
  {
    return Arm::Gic_h::Vcpu_ppi_cfg(l4_vcpu_e_read_32(_s, L4_VCPU_E_VTMR_CFG));
  }

  /// Write the virtual timer PPI configuration to the vCPU state.
  void vtmr(Arm::Gic_h::Vcpu_ppi_cfg cfg)
  { l4_vcpu_e_write_32(_s, L4_VCPU_E_VTMR_CFG, cfg.raw); }
};
} // namespace

View File

@@ -0,0 +1,40 @@
/*
* Copyright (C) 2017-2018, 2021, 2024 Kernkonzept GmbH.
* Author(s): Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <arm_hyp.h>
#include <l4/re/error_helper>
/**
 * Arm64-specific vCPU setup.
 *
 * \param vcpu         vCPU state to configure.
 * \param guest_64bit  True if the guest runs in AArch64 state.
 * \param pmsa         True to run the guest with PMSA (Armv8-R protected
 *                     memory), false for classic VMSA.
 *
 * \throws L4::Runtime_error(-L4_ENOSYS) if the host CPU does not support the
 *         requested memory system architecture.
 */
inline void arm_subarch_setup(void *vcpu, bool guest_64bit, bool pmsa)
{
  if (guest_64bit)
    {
      l4_umword_t hcr = l4_vcpu_e_read(vcpu, L4_VCPU_E_HCR);
      hcr |= 1UL << 18; // TID3: Trap ID Group 3 (feature system registers)
      hcr |= 1UL << 31; // set RW bit
      l4_vcpu_e_write(vcpu, L4_VCPU_E_HCR, hcr);
    }

  // Decode ID_AA64MMFR0_EL1 (encoded access, S3_0_C0_C7_0): MSA in bits
  // [51:48], MSA_frac in bits [55:52].
  unsigned long id_aa64mmfr0_el1;
  asm("mrs %0, S3_0_C0_C7_0" : "=r"(id_aa64mmfr0_el1));
  unsigned msa = (id_aa64mmfr0_el1 >> 48) & 0x0fU;
  unsigned msa_frac = (id_aa64mmfr0_el1 >> 52) & 0x0fU;
  // See Armv8-R AArch64 supplement (ARM DDI 0600A)
  if (pmsa && (msa == 0 || msa != 0xf || (msa_frac != 1 && msa_frac != 2)))
    L4Re::throw_error(-L4_ENOSYS, "CPU does not support PMSA");
  else if (!pmsa && !(msa == 0 || (msa == 0xf && msa_frac == 2)))
    L4Re::throw_error(-L4_ENOSYS, "CPU does not support VMSA");

  // Bit 31 selects the stage-2 translation regime (presumably VTCR_EL2.MSA:
  // set for VMSA, clear for PMSA — confirm against DDI 0600A).
  l4_vcpu_e_write_64(vcpu, L4_VCPU_E_VTCR, pmsa ? 0 : (1ULL << 31));

  // VPIDR default: Use MIDR_EL1 of this host CPU.
  l4_uint64_t midr_el1;
  asm volatile ("mrs %0, midr_el1" : "=r"(midr_el1));
  l4_vcpu_e_write_32(vcpu, L4_VCPU_E_VPIDR, midr_el1);
}

View File

@@ -0,0 +1,8 @@
/*
* Copyright (C) 2017-2020, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/

View File

@@ -0,0 +1,38 @@
/*
* Copyright (C) 2017, 2019-2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
// Low-level vCPU entry trampoline (AArch64).
//
// Written in assembly because it runs directly on vCPU entry: it saves the
// vCPU pointer and TPIDR_EL0, indexes into the vcpu_entries vector table
// using bits of vcpu->r.err, calls the selected handler, and finally
// resumes the guest via the vCPU-resume syscall (hvc).
extern "C" void vcpu_entry(l4_vcpu_state_t *vcpu);
asm
(
  "vcpu_entry: \n"
  " mrs x20, TPIDR_EL0 \n"
  " mov x21, x0 \n"
  " ldr x8, [x0, #0x248] \n" // l4_vcpu_e_info_user()[1]
  " ldr w9, [x0, #0x148] \n" // vcpu->r.err
  " msr TPIDR_EL0, x8 \n"
  " lsr x9, x9, #23 \n"
  " bic x9, x9, #7 \n"
#ifdef __PIC__
  " adrp x10, vcpu_entries \n"
  " add x10, x10, :lo12:vcpu_entries\n"
#else
  " ldr x10, =vcpu_entries \n"
#endif
  " add x10, x10, x9 \n"
  " ldr x11, [x10] \n"
  " blr x11 \n"
  " mov x0, x21 \n"
  " bl prepare_guest_entry \n" // sets MR[0] = L4_THREAD_VCPU_RESUME_OP
                               // and tag(L4_PROTO_THREAD, 1, 0, 0)
  " mov x2, #0xfffffffffffff803 \n" // dest = L4_INVALID_CAP, flags = call
  " mov x3, #0 \n" // timeout never
  " msr TPIDR_EL0, x20 \n"
  " hvc #0 \n"
);

View File

@@ -0,0 +1,169 @@
/*
* Copyright (C) 2021, 2023-2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "arm_exc.h"
#include "guest.h"
namespace Vmm {
namespace {
using namespace Arm;
/**
 * Emulate AArch64 exception entry to EL1h for the guest vCPU.
 *
 * Saves the old PSTATE to SPSR_EL1 and the faulting PC to ELR_EL1 via msr
 * (NOTE(review): assumes the EL1 registers hold guest state here — confirm
 * against the vCPU context-switch model), then redirects the guest to the
 * matching vector in its VBAR_EL1 table.
 */
void enter_exception64(Vcpu_ptr vcpu)
{
  l4_umword_t old_flags = vcpu->r.flags;
  l4_umword_t target_mode = Aarch64::Spsr_m_el1h;
  vcpu->r.flags = Aarch64::get_except_flags(vcpu, target_mode);
  asm volatile("msr SPSR_EL1, %0" : : "r"(old_flags));

  l4_umword_t mode = old_flags & Aarch64::Spsr_m_mask;
  unsigned exc_offset = Aarch64::get_except_offset(mode, target_mode);

  // Save current instruction pointer
  asm volatile("msr ELR_EL1, %0" : : "r"(vcpu->r.ip));

  // Set exception vector instruction pointer
  l4_umword_t vbar;
  asm volatile ("mrs %0, VBAR_EL1" : "=r"(vbar));
  vcpu->r.ip = vbar + exc_offset;
}
/**
 * Emulate AArch32 exception entry for a 32-bit guest on an AArch64 host.
 *
 * \param vcpu  Guest vCPU.
 * \param mode  Target PE mode (Aarch32::Psr_m_und or Aarch32::Psr_m_abt).
 * \param off   Vector-table offset of the exception to enter.
 */
void enter_exception32(Vcpu_ptr vcpu, unsigned mode, Aarch32::Exc_offset off)
{
  l4_uint32_t spsr = vcpu->r.flags;
  vcpu->r.flags = Aarch32::get_except_flags(vcpu, mode);
  l4_addr_t return_addr =
    vcpu->r.ip + Aarch32::get_return_offset(off, spsr & Aarch32::Psr_t);

  // SPSR and LR are banked registers for the PE mode the exception is taken to.
  if (mode == Aarch32::Psr_m_und)
    {
      // TODO: The SPSR_und register can only be accessed from EL2 mode...
      asm volatile("msr SPSR_und, %x0" : : "r"(spsr));
      // LR_und is mapped to GPR X22 on Aarch64
      vcpu->r.r[22] = return_addr;
    }
  else if (mode == Aarch32::Psr_m_abt)
    {
      // TODO: The SPSR_abt register can only be accessed from EL2 mode...
      asm volatile("msr SPSR_abt, %x0" : : "r"(spsr));
      // LR_abt is mapped to GPR X20 on Aarch64
      vcpu->r.r[20] = return_addr;
    }

  l4_uint32_t vbar;
  if (l4_vcpu_e_read_32(*vcpu, L4_VCPU_E_SCTLR) & Aarch32::Sctlr_v)
    // The guest uses high exception vectors.
    vbar = 0xffff0000;
  else
    asm volatile ("mrs %x0, VBAR_EL1" : "=r"(vbar)); // VBAR

  vcpu->r.ip = vbar + static_cast<unsigned>(off);
}
/* Inject abort into Aarch32 guest on Aarch64 host */
//
// Builds the A32 fault status/address registers for a prefetch or data abort
// and enters the abort exception handler of the guest.
//
// \param vcpu  vCPU taking the abort.
// \param inst  true for a prefetch (instruction) abort, false for a data
//              abort.
// \param addr  Faulting virtual address to report to the guest.
__attribute__ ((unused))
void inject_abort32(Vcpu_ptr vcpu, bool inst, l4_uint32_t addr)
{
  // Derive the FSR encoding (short vs. long descriptor format) from the
  // guest's translation control register.
  l4_uint32_t ttbcr;
  asm volatile ("mrs %x0, TCR_EL1" : "=r"(ttbcr));
  l4_uint32_t fsr = Aarch32::get_abort_fsr(ttbcr);
  l4_uint64_t far;
  asm volatile ("mrs %0, FAR_EL1" : "=r"(far));
  Aarch32::Exc_offset off;
  if (inst)
    {
      off = Aarch32::Exc_offset::Prefetch_abort;
      // IFAR is mapped to FAR_EL1 bits [63:32]: replace only the upper half
      // and keep DFAR in bits [31:0] intact.
      // BUGFIX: the previous mask '(~0xffffffffULL) << 32' shifted the set
      // bits out entirely (result 0) and thus wiped DFAR as well.
      far &= ~(0xffffffffULL << 32);
      far |= static_cast<l4_uint64_t>(addr) << 32;
      // TODO: The IFSR32_EL2 register can only be accessed from EL2 mode...
      asm volatile("msr IFSR32_EL2, %x0" : : "r"(fsr));
    }
  else
    {
      off = Aarch32::Exc_offset::Data_abort;
      // DFAR is mapped to FAR_EL1 bits [31:0]
      far &= ~0xffffffffULL;
      far |= addr;
      // DFSR is mapped to ESR_EL1.
      asm volatile("msr ESR_EL1, %x0" : : "r"(fsr));
    }
  asm volatile("msr FAR_EL1, %0" : : "r"(far));
  enter_exception32(vcpu, Aarch32::Psr_m_abt, off);
}
/* Inject abort into Aarch64 guest on Aarch64 host */
//
// Reports the faulting address via FAR_EL1 and the syndrome via ESR_EL1,
// then takes the vCPU to its EL1 exception vector.
//
// \param inst  true for an instruction abort, false for a data abort.
// \param addr  Faulting virtual address to report to the guest.
void inject_abort64(Vcpu_ptr vcpu, bool inst, l4_addr_t addr)
{
asm volatile("msr FAR_EL1, %0" : : "r"(addr));
// Syndrome is synthesized from the current vCPU fault state.
Hsr esr = Aarch64::get_abort_esr(vcpu, inst);
asm volatile("msr ESR_EL1, %x0" : : "r"(esr.raw()));
enter_exception64(vcpu);
}
// Inject an undefined-instruction exception into an AArch32 guest.
// Currently unreferenced (AArch32 injection is still TODO in the callers).
__attribute__ ((unused))
void inject_undef32(Vcpu_ptr vcpu)
{
enter_exception32(vcpu, Aarch32::Psr_m_und,
Aarch32::Exc_offset::Undefined_inst);
}
// Inject an undefined-instruction exception into an AArch64 guest by
// synthesizing an "unknown reason" syndrome in ESR_EL1.
void inject_undef64(Vcpu_ptr vcpu)
{
Hsr esr { 0 };
// Preserve the instruction-length bit of the faulting instruction.
esr.il() = vcpu.hsr().il();
esr.ec() = Hsr::Ec_unknown;
asm volatile("msr ESR_EL1, %x0" : : "r"(esr.raw()));
enter_exception64(vcpu);
}
}
/// Report whether the given fault handling mode is available on this
/// architecture. In addition to the generic modes, faults can be injected
/// directly into the guest.
bool Guest::fault_mode_supported(Fault_mode mode)
{
  if (mode == Fault_mode::Inject)
    return true;
  return Generic_guest::fault_mode_supported(mode);
}
/// Inject a memory abort into the guest.
///
/// \param vcpu  vCPU taking the abort.
/// \param inst  true for an instruction abort, false for a data abort.
/// \param addr  Faulting address reported to the guest.
/// \retval true   Abort was injected.
/// \retval false  Injection not possible (AArch32 guest state; still TODO).
bool
Guest::inject_abort(Vcpu_ptr vcpu, bool inst, l4_addr_t addr)
{
  if (Aarch64::is_aarch32(vcpu->r.flags))
    {
      // TODO: inject_abort32(vcpu, inst, addr);
      return false;
    }
  inject_abort64(vcpu, inst, addr);
  return true;
}
/// Inject an undefined-instruction exception into the guest.
///
/// \retval true   Exception was injected.
/// \retval false  Injection not possible (AArch32 guest state; still TODO).
bool
Guest::inject_undef(Vcpu_ptr vcpu)
{
  if (Aarch64::is_aarch32(vcpu->r.flags))
    {
      // TODO: inject_undef32(vcpu);
      return false;
    }
  inject_undef64(vcpu);
  return true;
}
}

View File

@@ -0,0 +1,535 @@
/*
* Copyright (C) 2022-2024 Kernkonzept GmbH.
* Author(s): Christian Pötzsch <christian.poetzsch@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/sys/thread.h>
#include <l4/re/elf_aux.h>
#include "guest.h"
L4RE_ELF_AUX_ELEM_T(l4re_elf_aux_mword_t, __ex_regs_flags,
L4RE_ELF_AUX_T_EX_REGS_FLAGS,
L4_THREAD_EX_REGS_ARM64_SET_EL_EL1);
// L4 system call trampoline: the vmm runs with its vCPU in (virtual) EL1
// (see the ex-regs flags above), so kernel entry is performed via a
// hypercall instruction.
asm (
".global __l4_sys_syscall\n"
".type __l4_sys_syscall, @function\n"
"__l4_sys_syscall:\n"
" hvc #0\n"
" ret\n"
);
namespace {
bool
has_aarch32()
{
l4_uint64_t aa64pfr0;
asm ("mrs %0, ID_AA64PFR0_EL1" : "=r"(aa64pfr0));
return (aa64pfr0 & 0x0f) == 2;
}
}
namespace Vmm {
/// Register an emulated system register handler under its AArch64
/// (op0, op1, CRn, CRm, op2) encoding.
void
Guest::add_sys_reg_aarch64(unsigned op0, unsigned op1,
                           unsigned crn, unsigned crm,
                           unsigned op2,
                           cxx::Ref_ptr<Vmm::Arm::Sys_reg> const &r)
{
  auto key = Vmm::Arm::Sys_reg::Key::sr(op0, op1, crn, crm, op2);
  _sys_regs[key] = r;
}
// Build the "S<op0>_<op1>_C<CRn>_C<CRm>_<op2>" mrs operand string for a
// system register encoding.
#define CP(coproc,opc1,CRn,CRm,opc2) "S" #coproc "_" #opc1 "_C" #CRn "_C" #CRm "_" #opc2
// Register an emulated feature ID register that returns the host's register
// value masked with the given bit mask (features uvmm does not virtualize
// are masked to 0).
#define ADD_FEAT_REG(coproc, opc1, CRn, CRm, opc2, ...) \
add_sys_reg_aarch64(coproc, opc1, CRn, CRm, opc2, cxx::make_ref_obj<Sys_reg_feat>([](){ \
l4_uint64_t regval; \
asm volatile("mrs %0, " CP(coproc, opc1, CRn, CRm, opc2) : "=r"(regval)); \
return regval & (__VA_ARGS__); \
}))
// Architecture-specific guest setup: register read-only emulations of the
// CPU feature ID registers. Each register mirrors the host's value, with
// fields for features uvmm does not support or virtualize (e.g. RAS, AMU,
// MPAM, SVE/SME, PAuth, MTE, PMU/debug extensions) masked to zero. The
// AArch32 ID registers are only emulated if the CPU supports AArch32 at EL0.
void
Guest::subarch_init()
{
using namespace Arm;
// Registers sorted according to encoding. See chapter
//
// D22.3.1 Instructions for accessing non-debug System registers
//
// in Arm Architecture Reference Manual ARM DDI 0487K.a.
// AArch32 feature ID registers (op0=3, op1=0, CRn=0, CRm=1..3).
if (has_aarch32())
{
// ID_PFR0_EL1
ADD_FEAT_REG(3, 0, 0, 1, 0,
( 0UL << 28) // Mask RAS
| (0xfUL << 24) // DIT
| ( 0UL << 20) // Mask AMU
| (0xfUL << 16) // CSV2
| (0xfUL << 12) // State3 (T32EE)
| (0xfUL << 8) // State2 (Jazelle)
| (0xfUL << 4) // State1 (T32)
| (0xfUL << 0) // State0 (A32)
);
// ID_PFR1_EL1
ADD_FEAT_REG(3, 0, 0, 1, 1,
(0xfUL << 28) // GIC
| (0xfUL << 24) // Virt_frac
| (0xfUL << 20) // Sec_frac
| (0xfUL << 16) // GenTimer
| (0xfUL << 12) // Virtualization
| (0xfUL << 8) // MProgMod
| (0xfUL << 4) // Security
| (0xfUL << 0) // ProgMod
);
// ID_DFR0_EL1
ADD_FEAT_REG(3, 0, 0, 1, 2,
( 0UL << 28) // Mask TraceFilt
| ( 0UL << 24) // Mask PerfMon
| ( 0UL << 20) // Mask MProfDbg
| ( 0UL << 16) // Mask MMapTrc
| ( 0UL << 12) // Mask CopTrc
| ( 0UL << 8) // Mask MMapDbg
| ( 0UL << 4) // Mask CopSDbg
| (0xfUL << 0) // CopDbg
);
// ID_AFR0_EL1 skipped intentionally
// ID_MMFR0_EL1
ADD_FEAT_REG(3, 0, 0, 1, 4,
(0xfUL << 28) // InnerShr
| (0xfUL << 24) // FCSE
| (0xfUL << 20) // AuxReg
| (0xfUL << 16) // TCM
| (0xfUL << 12) // ShareLvl
| (0xfUL << 8) // OuterShr
| (0xfUL << 4) // PMSA
| (0xfUL << 0) // VMSA
);
// ID_MMFR1_EL1
ADD_FEAT_REG(3, 0, 0, 1, 5,
(0xfUL << 28) // BPred
| (0xfUL << 24) // L1TstCln
| (0xfUL << 20) // L1Uni
| (0xfUL << 16) // L1Hvd
| (0xfUL << 12) // L1UniSW
| (0xfUL << 8) // L1HvdSW
| (0xfUL << 4) // L1UniVA
| (0xfUL << 0) // L1HvdVA
);
// ID_MMFR2_EL1
ADD_FEAT_REG(3, 0, 0, 1, 6,
(0xfUL << 28) // HWAccFlg
| (0xfUL << 24) // WFIStall
| (0xfUL << 20) // MemBarr
| (0xfUL << 16) // UniTLB
| (0xfUL << 12) // HvdTLB
| (0xfUL << 8) // L1HvdRng
| (0xfUL << 4) // L1HvdBG
| (0xfUL << 0) // L1HvdFG
);
// ID_MMFR3_EL1
ADD_FEAT_REG(3, 0, 0, 1, 7,
(0xfUL << 28) // Supersec
| (0xfUL << 24) // CMemSz
| (0xfUL << 20) // CohWalk
| (0xfUL << 16) // PAN
| (0xfUL << 12) // MaintBcst
| (0xfUL << 8) // BPMaint
| (0xfUL << 4) // CMaintSW
| (0xfUL << 0) // CMaintVA
);
// ID_ISAR0_EL1
ADD_FEAT_REG(3, 0, 0, 2, 0,
( 0UL << 28) // RES0
| (0xfUL << 24) // Divide
| (0xfUL << 20) // Debug
| (0xfUL << 16) // Coproc
| (0xfUL << 12) // CmpBranch
| (0xfUL << 8) // BitField
| (0xfUL << 4) // BitCount
| (0xfUL << 0) // Swap
);
// ID_ISAR1_EL1
ADD_FEAT_REG(3, 0, 0, 2, 1,
(0xfUL << 28) // Jazelle
| (0xfUL << 24) // Interwork
| (0xfUL << 20) // Immediate
| (0xfUL << 16) // IfThen
| (0xfUL << 12) // Extend
| (0xfUL << 8) // Except_AR
| (0xfUL << 4) // Except
| (0xfUL << 0) // Endian
);
// ID_ISAR2_EL1
ADD_FEAT_REG(3, 0, 0, 2, 2,
(0xfUL << 28) // Reversal
| (0xfUL << 24) // PSR_AR
| (0xfUL << 20) // MultU
| (0xfUL << 16) // MultS
| (0xfUL << 12) // Mult
| (0xfUL << 8) // MultiAccessInt
| (0xfUL << 4) // MemHint
| (0xfUL << 0) // LoadStore
);
// ID_ISAR3_EL1
ADD_FEAT_REG(3, 0, 0, 2, 3,
(0xfUL << 28) // T32EE
| (0xfUL << 24) // TrueNOP
| (0xfUL << 20) // T32Copy
| (0xfUL << 16) // TabBranch
| (0xfUL << 12) // SynchPrim
| (0xfUL << 8) // SVC
| (0xfUL << 4) // SIMD
| (0xfUL << 0) // Saturate
);
// ID_ISAR4_EL1
ADD_FEAT_REG(3, 0, 0, 2, 4,
(0xfUL << 28) // SWP_frac
| (0xfUL << 24) // PSR_M
| (0xfUL << 20) // SynchPrim_frac
| (0xfUL << 16) // Barrier
| (0xfUL << 12) // SMC
| (0xfUL << 8) // Writeback
| (0xfUL << 4) // WithShifts
| (0xfUL << 0) // Unpriv
);
// ID_ISAR5_EL1
ADD_FEAT_REG(3, 0, 0, 2, 5,
(0xfUL << 28) // VCMA
| (0xfUL << 24) // RDM
| ( 0UL << 20) // RES0
| (0xfUL << 16) // CRC32
| (0xfUL << 12) // SHA2
| (0xfUL << 8) // SHA1
| (0xfUL << 4) // AES
| (0xfUL << 0) // SEVL
);
// ID_MMFR4_EL1
ADD_FEAT_REG(3, 0, 0, 2, 6,
(0xfUL << 28) // EVT
| (0xfUL << 24) // CCIDX
| (0xfUL << 20) // LSM
| (0xfUL << 16) // HPDS
| (0xfUL << 12) // CnP
| (0xfUL << 8) // XNX
| (0xfUL << 4) // AC2
| ( 0UL << 0) // Mask SpecSEI (RAS)
);
// ID_ISAR6_EL1
ADD_FEAT_REG(3, 0, 0, 2, 7,
(0xfUL << 28) // CLRBHB
| (0xfUL << 24) // I8MM
| (0xfUL << 20) // BF16
| (0xfUL << 16) // SPECRES
| (0xfUL << 12) // SB
| (0xfUL << 8) // FHM
| (0xfUL << 4) // DP
| (0xfUL << 0) // JSCVT
);
// ID_MVFR0_EL1
ADD_FEAT_REG(3, 0, 0, 3, 0,
(0xfUL << 28) // FPRound
| (0xfUL << 24) // FPShVec
| (0xfUL << 20) // FPSqrt
| (0xfUL << 16) // FPDivide
| (0xfUL << 12) // FPTrap
| (0xfUL << 8) // FPDP
| (0xfUL << 4) // FPSP
| (0xfUL << 0) // SIMDReg
);
// ID_MVFR1_EL1
ADD_FEAT_REG(3, 0, 0, 3, 1,
(0xfUL << 28) // SIMDFMAC
| (0xfUL << 24) // FPHP
| (0xfUL << 20) // SIMDHP
| (0xfUL << 16) // SIMDSP
| (0xfUL << 12) // SIMDInt
| (0xfUL << 8) // SIMDLS
| (0xfUL << 4) // FPDNaN
| (0xfUL << 0) // FPFtZ
);
// ID_MVFR2_EL1
ADD_FEAT_REG(3, 0, 0, 3, 2,
(0xfUL << 4) // FPMisc
| (0xfUL << 0) // SIMDMisc
);
// ID_PFR2_EL1
ADD_FEAT_REG(3, 0, 0, 3, 4,
( 0UL << 8) // Mask RAS_frac (RAS)
| (0xfUL << 4) // SSBS
| (0xfUL << 0) // CSV3
);
// ID_DFR1_EL1
ADD_FEAT_REG(3, 0, 0, 3, 5,
( 0UL << 4) // Mask HPMN0 (PMU)
| ( 0UL << 0) // Mask MTPMU (PMU)
);
// ID_MMFR5_EL1
ADD_FEAT_REG(3, 0, 0, 3, 6,
(0xfUL << 4) // nTLBPA
| (0xfUL << 0) // ETS
);
}
// AArch64 feature ID registers (op0=3, op1=0, CRn=0, CRm=4..7).
// ID_AA64PFR0_EL1
ADD_FEAT_REG(3, 0, 0, 4, 0,
(0xfUL << 60) // CSV3
| (0xfUL << 56) // CSV2
| ( 0UL << 52) // Mask RME
| (0xfUL << 48) // DIT
| ( 0UL << 44) // Mask AMU
| ( 0UL << 40) // Mask MPAM
| (0xfUL << 36) // SEL2
| ( 0UL << 32) // Mask SVE
| ( 0UL << 28) // Mask RAS
| (0xfUL << 24) // GIC
| (0xfUL << 20) // AdvSIMD
| (0xfUL << 16) // FP
| (0xfUL << 12) // EL3
| (0xfUL << 8) // EL2
| (0xfUL << 4) // EL1
| (0xfUL << 0) // EL0
);
// ID_AA64PFR1_EL1
ADD_FEAT_REG(3, 0, 0, 4, 1,
( 0UL << 60) // Mask PFAR
| ( 0UL << 56) // Mask DF2 (RAS related)
| ( 0UL << 52) // Mask MTEX
| ( 0UL << 48) // Mask THE
| ( 0UL << 44) // Mask GCS
| ( 0UL << 40) // Mask MTE_frac
| (0xfUL << 36) // NMI
| (0xfUL << 32) // CSV2_frac
| (0xfUL << 28) // RNDR_trap
| ( 0UL << 24) // Mask SME
| ( 0UL << 20) // RES0
| ( 0UL << 16) // Mask MPAM_frac
| ( 0UL << 12) // Mask RAS_frac
| ( 0UL << 8) // Mask MTE
| (0xfUL << 4) // SSBS
| (0xfUL << 0) // BT
);
// ID_AA64PFR2_EL1
ADD_FEAT_REG(3, 0, 0, 4, 2,
(0xfUL << 8) // Mask MTEFAR (MTE)
| (0xfUL << 4) // Mask MTESTOREONLY (MTE)
| (0xfUL << 0) // Mask MTEPERM (MTE)
);
// ID_AA64ZFR0_EL1 skipped intentionally (SVE Feature ID Register 0)
// ID_AA64SMFR0_EL1 skipped intentionally (SME Feature ID Register 0)
// ID_AA64DFR0_EL1
ADD_FEAT_REG(3, 0, 0, 5, 0,
( 0UL << 60) // Mask HPMN0
| ( 0UL << 56) // Mask ExtTrcBuff
| ( 0UL << 52) // Mask BRBE
| ( 0UL << 48) // Mask MTPMU
| ( 0UL << 44) // Mask TraceBuffer
| ( 0UL << 40) // Mask TraceFilt
| ( 0UL << 36) // Mask DoubleLock
| ( 0UL << 32) // Mask PMSVer
| ( 0UL << 28) // Mask CTX_CMPs
| ( 0UL << 24) // Mask SEBEP
| ( 0UL << 20) // Mask WRPs
| ( 0UL << 16) // Mask PMSS
| ( 0UL << 12) // Mask BRPs
| ( 0UL << 8) // Mask PMUVer
| ( 0UL << 4) // Mask TraceVer
| (0xfUL << 0) // DebugVer
);
// ID_AA64DFR1_EL1
ADD_FEAT_REG(3, 0, 0, 5, 1,
( 0UL << 56) // Mask ABL_CMPs
| ( 0UL << 52) // Mask DPFZS
| ( 0UL << 48) // Mask EBEP
| ( 0UL << 44) // Mask ITE
| ( 0UL << 40) // Mask ABLE
| ( 0UL << 36) // Mask PMICNTR
| ( 0UL << 32) // Mask SPMU
| ( 0UL << 24) // Mask CTX_CMPs
| ( 0UL << 16) // Mask WRPs
| ( 0UL << 8) // Mask BRPs
| ( 0UL << 0) // Mask SYSPMUID
);
// ID_AA64AFR0_EL1 skipped intentionally
// ID_AA64AFR1_EL1 skipped intentionally
// ID_AA64ISAR0_EL1
ADD_FEAT_REG(3, 0, 0, 6, 0,
(0xfUL << 60) // RNDR
| (0xfUL << 56) // TLB
| (0xfUL << 52) // TS
| (0xfUL << 48) // FHM
| (0xfUL << 44) // DP
| (0xfUL << 40) // SM4
| (0xfUL << 36) // SM3
| (0xfUL << 32) // SHA3
| (0xfUL << 28) // RDM
| (0xfUL << 24) // TME
| (0xfUL << 20) // Atomic
| (0xfUL << 16) // CRC32
| (0xfUL << 12) // SHA2
| (0xfUL << 8) // SHA1
| (0xfUL << 4) // AES
| ( 0UL << 0) // RES0
);
// ID_AA64ISAR1_EL1
ADD_FEAT_REG(3, 0, 0, 6, 1,
(0xfUL << 60) // LS64
| (0xfUL << 56) // XS
| (0xfUL << 52) // I8MM
| (0xfUL << 48) // DGH
| (0xfUL << 44) // BF16
| (0xfUL << 40) // SPECRES
| (0xfUL << 36) // SB
| (0xfUL << 32) // FRINTTS
| ( 0UL << 28) // Mask GPI (FEAT_PAuth)
| ( 0UL << 24) // Mask GPA (FEAT_PAuth)
| ( 0UL << 20) // Mask LRCPC (FEAT_LRCPC)
| (0xfUL << 16) // FCMA
| (0xfUL << 12) // JSCVT
| ( 0UL << 8) // Mask API (FEAT_PAuth)
| ( 0UL << 4) // Mask APA (FEAT_PAuth)
| (0xfUL << 0) // DPB
);
// ID_AA64ISAR2_EL1
ADD_FEAT_REG(3, 0, 0, 6, 2,
(0xfUL << 60) // ATS1A
| ( 0UL << 56) // RES0
| (0xfUL << 52) // CSSC
| (0xfUL << 48) // RPRFM
| ( 0UL << 44) // RES0
| (0xfUL << 40) // PRFMSLC
| (0xfUL << 36) // SYSINSTR_128
| (0xfUL << 32) // SYSREG_128
| (0xfUL << 28) // CLRBHB
| ( 0UL << 24) // Mask PAC_frac (FEAT_PAuth2)
| (0xfUL << 20) // BC
| (0xfUL << 16) // MOPS
| ( 0UL << 12) // Mask APA3 (FEAT_PAuth)
| ( 0UL << 8) // Mask GPA3 (FEAT_PAuth)
| (0xfUL << 4) // RPRES
| (0xfUL << 0) // WFxT
);
// ID_AA64MMFR0_EL1
ADD_FEAT_REG(3, 0, 0, 7, 0,
(0xfUL << 60) // ECV
| (0xfUL << 56) // FGT
| (0xfUL << 52) // MSA_frac
| (0xfUL << 48) // MSA
| (0xfUL << 44) // ExS
| (0xfUL << 40) // TGran4_2
| (0xfUL << 36) // TGran64_2
| (0xfUL << 32) // TGran16_2
| (0xfUL << 28) // TGran4
| (0xfUL << 24) // TGran64
| (0xfUL << 20) // TGran16
| (0xfUL << 16) // BigEndEL0
| (0xfUL << 12) // SNSMem
| (0xfUL << 8) // BigEnd
| (0xfUL << 4) // ASIDBits
| (0xfUL << 0) // PARange
);
// ID_AA64MMFR1_EL1
ADD_FEAT_REG(3, 0, 0, 7, 1,
(0xfUL << 60) // ECBHB
| (0xfUL << 56) // CMOW
| (0xfUL << 52) // TIDCP1
| (0xfUL << 48) // nTLBPA
| (0xfUL << 44) // AFP
| (0xfUL << 40) // HCX
| (0xfUL << 36) // ETS
| (0xfUL << 32) // TWED
| (0xfUL << 28) // XNX
| ( 0UL << 24) // Mask SpecSEI (RAS)
| (0xfUL << 20) // PAN
| ( 0UL << 16) // Mask LO (FEAT_LOR)
| (0xfUL << 12) // HPDS
| (0xfUL << 8) // VH
| (0xfUL << 4) // VMIDBits
| (0xfUL << 0) // HAFDBS
);
// ID_AA64MMFR2_EL1
ADD_FEAT_REG(3, 0, 0, 7, 2,
(0xfUL << 60) // E0PD
| (0xfUL << 56) // EVT
| (0xfUL << 52) // BBM
| (0xfUL << 48) // TTL
| ( 0UL << 44) // RES0
| (0xfUL << 40) // FWB
| (0xfUL << 36) // IDS
| (0xfUL << 32) // AT
| (0xfUL << 28) // ST
| (0xfUL << 24) // NV
| (0xfUL << 20) // CCIDX
| (0xfUL << 16) // VARange
| ( 0UL << 12) // Mask IESB (FEAT_IESB + FEAT_RAS)
| (0xfUL << 8) // LSM
| (0xfUL << 4) // UAO
| (0xfUL << 0) // CnP
);
// ID_AA64MMFR3_EL1
ADD_FEAT_REG(3, 0, 0, 7, 3,
( 0UL << 60) // Mask Spec_FPACC (FEAT_PAuth++)
| ( 0UL << 56) // Mask ADERR (RASv2)
| ( 0UL << 52) // Mask SDERR (RASv2)
| ( 0UL << 48) // RES0
| ( 0UL << 44) // Mask ANERR (RASv2)
| ( 0UL << 40) // Mask SNERR (RASv2)
| (0xfUL << 36) // D128_2
| (0xfUL << 32) // D128
| ( 0UL << 28) // Mask MEC
| (0xfUL << 24) // AIE
| (0xfUL << 20) // S2POE
| (0xfUL << 16) // S1POE
| (0xfUL << 12) // S2PIE
| (0xfUL << 8) // S1PIE
| (0xfUL << 4) // SCTLRX
| (0xfUL << 0) // TCRX
);
// ID_AA64MMFR4_EL1
ADD_FEAT_REG(3, 0, 0, 7, 4,
0UL << 4 // EIESB (RAS)
);
}
}

View File

@@ -0,0 +1,14 @@
/*
* Copyright (C) 2017-2020, 2022, 2024 Kernkonzept GmbH.
* Author(s): Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
namespace Vmm {
// Capability flag: this (AArch64) build of uvmm supports 64-bit guests.
enum { Guest_64bit_supported = true };
}

View File

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2019, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
* Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include <cstring>
#include <l4/sys/vcpu.h>
#include "vcpu_ptr.h"
#include "monitor/monitor.h"
#include "monitor/monitor_args.h"
namespace Monitor {
// Monitor command handler for per-CPU state. The primary template is an
// empty stub used when the monitor is compiled out.
template<bool, typename T>
class Cpu_dev_cmd_handler {};
// Enabled specialization: implements the 'cpu <i> regs' monitor command.
// T is the CPU device class (CRTP) providing vcpu().
template<typename T>
class Cpu_dev_cmd_handler<true, T> : public Cmd
{
public:
char const *help() const override
{ return "CPU state"; }
void usage(FILE *f) const override
{
fprintf(f, "%s\n"
"* 'cpu <i> regs': dump CPU registers\n",
help());
}
void complete(FILE *f, Completion_request *compl_req) const override
{ compl_req->complete(f, "regs"); }
void exec(FILE *f, Arglist *args) override
{
if (*args == "regs")
show_regs(f);
else
argument_error("Invalid subcommand");
}
// Dump x0..x30 (four per line), pc, sp, PSTATE and the guest's SCTLR.
void show_regs(FILE *f) const
{
auto vcpu = get_vcpu();
auto regs = vcpu->r;
for (unsigned i = 0; i < 31; ++i)
fprintf(f, "x%2d:%16lx%s", i, regs.r[i], (i % 4) == 3 ? "\n" : " ");
fprintf(f, "\npc=%lx sp=%lx psr=%lx sctlr=%x\n",
regs.ip, regs.sp, regs.flags,
l4_vcpu_e_read_32(*vcpu, L4_VCPU_E_SCTLR));
}
private:
// CRTP accessor for the vCPU owned by the derived CPU device.
Vmm::Vcpu_ptr get_vcpu() const
{ return static_cast<T const *>(this)->vcpu(); }
};
}

View File

@@ -0,0 +1,124 @@
/*
* Copyright (C) 2015-2020, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cassert>
#include "aarch64_hyp.h"
#include "generic_vcpu_ptr.h"
#include "mem_access.h"
namespace Vmm {
// AArch64-specific view onto an l4_vcpu_state_t, adding register access,
// timer register helpers and MMIO access decoding on top of the generic
// vCPU pointer.
class Vcpu_ptr : public Generic_vcpu_ptr
{
public:
explicit Vcpu_ptr(l4_vcpu_state_t *s) : Generic_vcpu_ptr(s) {}
// True if the current fault is a write access (from the syndrome).
bool pf_write() const
{ return hsr().pf_write(); }
// Generic timer counter frequency (CNTFRQ_EL0).
static l4_uint32_t cntfrq()
{
l4_uint32_t x;
asm volatile ("mrs %x0, CNTFRQ_EL0" : "=r"(x));
return x;
}
// Current virtual counter value (CNTVCT_EL0).
static l4_uint64_t cntvct()
{
l4_uint64_t x;
asm volatile ("mrs %0, CNTVCT_EL0" : "=r"(x));
return x;
}
// Virtual timer compare value (CNTV_CVAL_EL0).
static l4_uint64_t cntv_cval()
{
l4_uint64_t x;
asm volatile ("mrs %0, CNTV_CVAL_EL0" : "=r"(x));
return x;
}
// Host TLS pointer saved by thread_attach() (vCPU info user word 1).
void *saved_tls() const
{ return reinterpret_cast<void *>(l4_vcpu_e_info_user(_s)[1]); }
// Restore the host's TPIDR_EL0 (TLS) on vCPU entry and return the host
// UTCB saved by thread_attach() (vCPU info user word 0).
l4_utcb_t *restore_on_entry() const
{
asm volatile("msr TPIDR_EL0, %0" : : "r"(saved_tls()));
return reinterpret_cast<l4_utcb_t *>(l4_vcpu_e_info_user(_s)[0]);
}
// Bind the vCPU to the current thread and stash the host UTCB pointer and
// TPIDR_EL0 for later restoration by restore_on_entry().
void thread_attach()
{
control_ext(L4::Cap<L4::Thread>());
void **x = reinterpret_cast<void **>(l4_vcpu_e_info_user(_s));
x[0] = l4_utcb();
asm volatile ("mrs %0, TPIDR_EL0" : "=r"(x[1]));
}
// Exception syndrome of the current vCPU fault.
Arm::Hsr hsr() const
{ return Arm::Hsr(_s->r.err); }
// Skip the trapped instruction: 2 bytes if IL == 0 (16-bit), 4 otherwise.
void jump_instruction() const
{ _s->r.ip += 2 << hsr().il(); }
// Read GPR x0..x30; register 31 reads as zero (XZR semantics).
l4_umword_t get_gpr(unsigned x) const
{
if (x < 31)
return _s->r.r[x];
else
return 0;
}
// Write GPR x0..x30; writes to register 31 are discarded.
void set_gpr(unsigned x, l4_umword_t value) const
{
if (x < 31)
_s->r.r[x] = value;
}
// Link register (x30).
l4_umword_t get_lr() const
{
return _s->r.r[30];
}
// Decode the faulting access from the syndrome. If the syndrome carries no
// valid instruction info (pf_isv() == 0), access is marked Other.
Mem_access decode_mmio() const
{
Mem_access m;
if (!hsr().pf_isv())
{
m.access = Mem_access::Other;
return m;
}
m.width = hsr().pf_sas();
m.access = hsr().pf_write() ? Mem_access::Store : Mem_access::Load;
if (m.access == Mem_access::Store)
m.value = get_gpr(hsr().pf_srt());
return m;
}
// Write the result of an emulated load back to the target register,
// sign/zero-extended according to access size and SSE bit.
void writeback_mmio(Mem_access const &m) const
{
assert(m.access == Mem_access::Load);
l4_umword_t v = reg_extend_width(m.value, hsr().pf_sas(), hsr().pf_sse());
set_gpr(hsr().pf_srt(), v);
}
// Virtual timer PPI configuration stored in the vCPU extended state.
Arm::Gic_h::Vcpu_ppi_cfg vtmr() const
{
return Arm::Gic_h::Vcpu_ppi_cfg(l4_vcpu_e_read_32(_s, L4_VCPU_E_VTMR_CFG));
}
void vtmr(Arm::Gic_h::Vcpu_ppi_cfg cfg)
{ l4_vcpu_e_write_32(_s, L4_VCPU_E_VTMR_CFG, cfg.raw); }
};
} // namespace

View File

@@ -0,0 +1,23 @@
/*
* Copyright (C) 2022, 2024 Kernkonzept GmbH.
* Author(s): Christian Pötzsch <christian.potzsch@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
namespace Boot {
// Load a raw (headerless) binary guest image into guest RAM.
//
// \param image      Binary to load.
// \param ram        Guest RAM to load into.
// \param free_list  Free-memory accounting, updated by the load.
// \param[in,out] entry  On input the requested load address, or ~0ul to use
//                       the default of 0x100000; on output the boot entry
//                       point.
// \return L4_EOK.
//
// NOTE(review): the returned entry is unconditionally guest-phys 0x100400 --
// presumably the raw image carries its entry stub at offset 0x400 from the
// default load address; confirm that this matches images loaded at a
// caller-specified *entry != 0x100000.
static int raw_load_image(std::shared_ptr<Binary_ds> image, Vmm::Vm_ram *ram,
Vmm::Ram_free_list *free_list, l4_addr_t *entry)
{
l4_addr_t start = *entry == ~0ul ? 0x100000 : *entry;
image->load_as_raw(ram, Vmm::Guest_addr(start), free_list);
*entry = ram->guest_phys2boot(Vmm::Guest_addr(0x100400));
return L4_EOK;
}
}

View File

@@ -0,0 +1,199 @@
/*
* Copyright (C) 2017-2019, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cassert>
#include "mmio_device.h"
#include "vm_memmap.h"
#include "cpc.h"
namespace Vdev {
/**
* Virtual Mips coherency manager.
*
* This device only implements the global registers of the CM and
* only emulatates functionality necessary for SMP support.
* Access to the CPU-local registers are forwarded to the
* CPC.
*
* Each VCPU is reported as a separate core with exactly one VPE.
*/
class Coherency_manager : public Vmm::Mmio_device_t<Coherency_manager>
{
  // Guest-physical layout of the CM register block.
  enum Memmap
  {
    Base_address = 0x1fbf8000,
    Cm_size = 0x10000,
    Control_block_size = 0x2000,
    Core_local_base = 0x2000,  // CPU-local registers, forwarded to the CPC
    Core_other_base = 0x4000   // registers of the core selected as "other"
  };

  // Offsets of the emulated global control registers (GCRs).
  enum Global_control_block
  {
    Gcr_config = 0x0,
    Gcr_base = 0x8,
    Gcr_control = 0x10,
    Gcr_rev = 0x30,
    Gcr_gic_base = 0x80,
    Gcr_cpc_base = 0x88,
    Gcr_gic_status = 0xd0,
    Gcr_cpc_status = 0xf0,
    Gcr_sys_config2 = 0x150,
    Gcr_bev_base = 0x680
  };

public:
  /// GCR_CPC_BASE register: CPC enable bit plus the CPC base address.
  struct Cpc_base_addr_reg
  {
    Cpc_base_addr_reg() = default;
    explicit Cpc_base_addr_reg(l4_umword_t value) : raw(value) {}
    l4_umword_t raw;
    CXX_BITFIELD_MEMBER(0, 0, enable, raw);
#ifdef __mips64
    CXX_BITFIELD_MEMBER_UNSHIFTED_RO(15, 47, base_addr, raw);
#else
    CXX_BITFIELD_MEMBER_UNSHIFTED_RO(15, 31, base_addr, raw);
#endif
  };

  /// \param memmap  Guest memory map; used to map the CPC at the base
  ///                address chosen by the guest. Must outlive this device.
  Coherency_manager(Vmm::Vm_mem *memmap)
  : _memmap(memmap), _gic_base(0), _cpc_base(0)
  {}

  /// Fixed guest-physical region occupied by the CM register block.
  static Vmm::Region mem_region()
  {
    return Vmm::Region::ss(Vmm::Guest_addr(Base_address), Cm_size,
                           Vmm::Region_type::Virtual);
  }

  /// Attach the CPC that handles the CPU-local register blocks.
  void register_cpc(cxx::Ref_ptr<Vdev::Mips_cpc> const &cpc) { _cpc = cpc; }

  /// MMIO read handler.
  l4_umword_t read(unsigned reg, char, unsigned cpuid)
  {
    // CPU-local and core-other blocks are handled by the CPC, if registered.
    if (reg >= Core_local_base && reg < Core_local_base + Control_block_size)
      {
        if (!_cpc)
          return 0;
        return _cpc->cm_read_core(reg - Core_local_base, cpuid, false);
      }
    if (reg >= Core_other_base && reg < Core_other_base + Control_block_size)
      {
        if (!_cpc)
          return 0;
        return _cpc->cm_read_core(reg - Core_other_base, cpuid, true);
      }
    Dbg(Dbg::Cpu, Dbg::Trace, "CM").printf("reading GCR @ 0x%x\n", reg);
    switch (reg)
      {
      case Gcr_config:
        // Guard against a not yet registered CPC (the CPU-local paths above
        // already do); report no ICUs either way.
        return _cpc ? _cpc->max_cpuid() : 0; // no ICUs
      case Gcr_base:
        return Base_address;
      case Gcr_rev:
        return 8 << 8; // CM3
      case Gcr_gic_base:
        return _gic_base;
      case Gcr_cpc_base:
        return _cpc_base.raw;
      case Gcr_gic_status:
        return 1; // GIC is CM-controlled
      case Gcr_cpc_status:
        return 1; // CPC enabled
      case Gcr_sys_config2:
        return 1; // maximum 1 VP per core
      case Gcr_bev_base:
        return _cpc ? _cpc->bev_base() : 0;
      }
    Dbg(Dbg::Cpu, Dbg::Info, "CM").printf("reading @ 0x%x ignored.\n", reg);
    return 0;
  }

  /// MMIO write handler.
  void write(unsigned reg, char, l4_umword_t value, unsigned cpuid)
  {
    if (reg >= Core_local_base && reg < Core_local_base + Control_block_size)
      {
        if (_cpc)
          _cpc->cm_write_core(reg - Core_local_base, value, cpuid, false);
        return;
      }
    if (reg >= Core_other_base && reg < Core_other_base + Control_block_size)
      {
        if (_cpc)
          _cpc->cm_write_core(reg - Core_other_base, value, cpuid, true);
        return;
      }
    Dbg(Dbg::Cpu, Dbg::Trace, "CM")
      .printf("writing GCR 0x%lx @ 0x%x\n", value, reg);
    switch (reg)
      {
      case Gcr_gic_base:
        // XXX check that this address is expected
        _gic_base = value;
        break;
      case Gcr_cpc_base:
        {
          Cpc_base_addr_reg newbase(value);
          if (_cpc_base.base_addr())
            {
              // Base already chosen: only the enable bit may change; the
              // base address itself cannot be moved afterwards.
              _cpc_base.enable() = static_cast<unsigned>(newbase.enable());
              if (newbase.enable()
                  && _cpc_base.base_addr() != newbase.base_addr())
                Dbg(Dbg::Cpu, Dbg::Warn, "CM")
                  .printf("WARNING: change of CPC base address ignored.\n");
            }
          else
            {
              // First write: map the CPC register block at the guest-chosen
              // address.
              _cpc_base.raw = newbase.raw;
              assert(_cpc);
              Dbg(Dbg::Cpu, Dbg::Info, "CM")
                .printf("Mapping CPC @ 0x%lx\n",
                        static_cast<l4_addr_t>(_cpc_base.base_addr()));
              (*_memmap)[Vmm::Region::ss(Vmm::Guest_addr(_cpc_base.base_addr()),
                                         Mips_cpc::Cpc_size,
                                         Vmm::Region_type::Virtual)] = _cpc;
            }
          break;
        }
      case Gcr_bev_base:
        if (_cpc)
          _cpc->set_bev_base(value);
        break;
      default:
        Dbg(Dbg::Cpu, Dbg::Info, "CM")
          .printf("writing GCR 0x%lx @ 0x%x ignored.\n", value, reg);
      }
  }

  char const *dev_name() const override { return "Coherency_manager"; }

private:
  cxx::Ref_ptr<Vdev::Mips_cpc> _cpc;
  Vmm::Vm_mem *_memmap;
  l4_addr_t _gic_base;
  Cpc_base_addr_reg _cpc_base;
};
} // name space

View File

@@ -0,0 +1,206 @@
/*
* Copyright (C) 2016-2018, 2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cassert>
#include <cstdio>
#include <mutex>
#include <l4/cxx/bitfield>
#include "irq.h"
#include "vcpu_ptr.h"
namespace Gic {
/**
* Interrupt handler for core interrupts for a single VCPU.
*
* The Mips core interrupts are line-triggered and can be connected
* to multiple devices. This is implemented by a pending counter for
* each interrupt. Only connect stateful IRQ sinks to ensure the counting
* is correct.
*
* Only handles the hardware interrupts 2 - 7.
*/
class Vcpu_ic : public Ic
{
enum
{
Min_irq = 2,
Max_irq = 7
};
public:
Vcpu_ic()
: _cpu_irq(L4Re::chkcap(L4Re::Util::make_unique_cap<L4::Irq>(),
"allocate vcpu notification interrupt")),
_irqvec(0)
{
for (size_t i = Min_irq; i <= Max_irq; ++i)
_pending[i - Min_irq] = 0;
L4Re::Env::env()->factory()->create(_cpu_irq.get());
}
void attach_cpu_thread(L4::Cap<L4::Thread> thread)
{
L4Re::chksys(_cpu_irq->bind_thread(thread, 0),
"Bind vCPU IRQ-notification IRQ.");
}
void set(unsigned irq) override
{
assert(Min_irq <= irq && irq <= Max_irq);
std::lock_guard<std::mutex> lock(_lock);
if (++_pending[irq - Min_irq] == 1)
{
_irqvec |= 1UL << (irq - Min_irq);
_cpu_irq->trigger();
}
}
void clear(unsigned irq) override
{
assert(Min_irq <= irq && irq <= Max_irq);
std::lock_guard<std::mutex> lock(_lock);
if (--_pending[irq - Min_irq] == 0)
{
_irqvec &= ~(1UL << (irq - Min_irq));
_cpu_irq->trigger();
}
}
void bind_irq_src_handler(unsigned, Irq_src_handler *) override
{
L4Re::chksys(-L4_ENOSYS, "unmask not supported for Core IC. "
"Use GIC for devices that require EOI via IC.");
}
Irq_src_handler *get_irq_src_handler(unsigned) const override
{ return nullptr; }
int dt_get_interrupt(fdt32_t const *prop, int propsz, int *read) const override
{
if (propsz < 1)
return -L4_ERANGE;
int irq = fdt32_to_cpu(prop[0]);
if (read)
*read = 1;
return irq;
}
l4_uint32_t irq_vector()
{
std::lock_guard<std::mutex> lock(_lock);
return _irqvec;
}
void show_state(FILE *f, Vmm::Vcpu_ptr vcpu)
{
auto *s = vcpu.state();
s->update_state(L4_VM_MOD_STATUS);
unsigned imask = s->g_status >> 8;
unsigned ipending = s->g_cause >> 8;
for (unsigned i = Min_irq; i <= Max_irq; ++i)
fprintf(f, " Int %d: %d (HW: %s/%s)\n", i,
_pending[i - Min_irq],
imask & (1 << i) ? "on" : "off",
ipending & (1 << i) ? "pending" : "low");
}
private:
L4Re::Util::Unique_cap<L4::Irq> _cpu_irq;
/// Cached output pending array.
l4_uint32_t _irqvec;
/// Count for each interrupt the number of incomming sources.
int _pending[Max_irq - Min_irq + 1];
std::mutex _lock;
};
/**
* Device for all core interrupts.
*
* This device is not an interrupt handler itself, it just holds
* an array of core interrupt handlers, one for each core.
*/
class Mips_core_ic : public virtual Vdev::Dev_ref
{
enum { Max_ics = 32 };
struct Hw_int_reg
{
l4_umword_t raw;
CXX_BITFIELD_MEMBER(10, 15, hw_ints, raw);
Hw_int_reg(l4_umword_t r) : raw(r) {}
};
public:
Mips_core_ic()
{
// there always is an IC for CPU 0
_core_ics[0] = Vdev::make_device<Vcpu_ic>();
}
virtual ~Mips_core_ic() = default;
void create_ic(unsigned i, L4::Cap<L4::Thread> thread)
{
assert(i <= Max_ics);
// start up one core IC per vcpu
if (!_core_ics[i])
_core_ics[i] = Vdev::make_device<Vcpu_ic>();
_core_ics[i]->attach_cpu_thread(thread);
}
cxx::Ref_ptr<Vcpu_ic> get_ic(unsigned cpuid) const
{
assert(cpuid < Max_ics);
return _core_ics[cpuid];
}
static bool has_pending(Vmm::Vcpu_ptr vcpu)
{
return Hw_int_reg(vcpu.state()->guest_ctl_2).hw_ints();
}
void update_vcpu(Vmm::Vcpu_ptr vcpu)
{
unsigned cpuid = vcpu.get_vcpu_id();
assert(cpuid < Max_ics);
assert(_core_ics[cpuid]);
auto irqvec = _core_ics[cpuid]->irq_vector();
Hw_int_reg *gc2 = (Hw_int_reg *) &vcpu.state()->guest_ctl_2;
l4_uint32_t oldvec = gc2->hw_ints();
if (oldvec == irqvec)
return;
gc2->hw_ints() = irqvec;
vcpu.state()->set_modified(L4_VM_MOD_GUEST_CTL_2);
}
void show_state(FILE *f, Vmm::Vcpu_ptr vcpu)
{
unsigned cpuid = vcpu.get_vcpu_id();
if (_core_ics[cpuid])
_core_ics[cpuid]->show_state(f, vcpu);
}
private:
cxx::Ref_ptr<Vcpu_ic> _core_ics[Max_ics];
};
} // namespace

View File

@@ -0,0 +1,117 @@
/*
* Copyright (C) 2017, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "cpc.h"
namespace Vdev {
// Log channels for the cluster power controller (CPC) model.
static Dbg warn(Dbg::Cpu, Dbg::Warn, "CPC");
static Dbg info(Dbg::Cpu, Dbg::Info, "CPC");
static Dbg trace(Dbg::Cpu, Dbg::Trace, "CPC");
/// MMIO read handler: dispatch CPU-local and core-other register block
/// accesses to the per-core read routine; everything else reads as zero.
l4_umword_t
Mips_cpc::read(unsigned reg, char, unsigned cpuid)
{
  trace.printf("reading CPC @ 0x%x\n", reg);

  // Core-local block: operate on the requesting core itself.
  if (reg >= Core_local_base && reg < Core_local_base + Control_block_size)
    return cpc_read_core(reg - Core_local_base, cpuid);

  // Core-other block: operate on the core selected via the requester's
  // core-other register.
  if (reg >= Core_other_base && reg < Core_other_base + Control_block_size)
    {
      bool known = cpuid < _cpus->size() && _cpus->vcpu_exists(cpuid);
      if (!known)
        {
          info.printf("read on unknown other core %d. Ignored.\n", cpuid);
          return 0;
        }
      return cpc_read_core(reg - Core_other_base,
                           _cpus->cpu(cpuid)->core_other());
    }

  info.printf("reading unknown register @ 0x%x ignored.\n", reg);
  return 0;
}
/// MMIO write handler: dispatch CPU-local and core-other register block
/// accesses to the per-core write routine; other registers are ignored.
void
Mips_cpc::write(unsigned reg, char, l4_umword_t value, unsigned cpuid)
{
  trace.printf("writing CPC 0x%lx @ 0x%x\n", value, reg);
  if (reg >= Core_local_base && reg < Core_local_base + Control_block_size)
    cpc_write_core(reg - Core_local_base, value, cpuid);
  else if (reg >= Core_other_base && reg < Core_other_base + Control_block_size)
    {
      if ((cpuid < _cpus->size()) && _cpus->vcpu_exists(cpuid))
        cpc_write_core(reg - Core_other_base, value,
                       _cpus->cpu(cpuid)->core_other());
      else
        // BUGFIX: message said "read"; this is the write path.
        info.printf("write on unknown other core %d. Ignored.\n", cpuid);
    }
  else
    info.printf("writing unknown register 0x%lx @ 0x%x ignored.\n", value, reg);
}
/// Read a CPU-local CPC register of core \a cpuid. Only the status/
/// configuration register is implemented; everything else reads as zero.
l4_umword_t
Mips_cpc::cpc_read_core(unsigned reg, unsigned cpuid)
{
  bool exists = cpuid < _cpus->size() && _cpus->vcpu_exists(cpuid);
  if (!exists)
    {
      info.printf("CPC reading from uninitialised core %d ignored.\n", cpuid);
      return 0;
    }

  trace.printf("core %d: reading CPC @ 0x%x\n", cpuid, reg);

  if (reg == Cpc_cl_stat_conf_reg)
    return _cpus->cpu(cpuid)->cpc_status();

  info.printf("core %d: reading CPC @ 0x%x ignored.\n", cpuid, reg);
  return 0;
}
/// Write a CPU-local CPC register of core \a cpuid. Only the command
/// register is implemented; it powers the vCPU up or down.
void
Mips_cpc::cpc_write_core(unsigned reg, l4_umword_t value, unsigned cpuid)
{
  bool exists = cpuid < _cpus->size() && _cpus->vcpu_exists(cpuid);
  if (!exists)
    {
      info.printf("CPC writing to uninitialised core %d ignored.\n", cpuid);
      return;
    }

  trace.printf("core %d: writing CPC 0x%lx @ 0x%x\n", cpuid, value, reg);

  if (reg != Cpc_cl_cmd_reg)
    {
      info.printf("core %d: writing 0x%lx @ 0x%x ignored.\n",
                  cpuid, value, reg);
      return;
    }

  // Command is encoded in the low three bits; remember it so the status
  // register can report it back.
  unsigned cmd = value & 0x7;
  auto cpu = _cpus->cpu(cpuid);
  cpu->set_last_command(cmd);
  switch (cmd)
    {
    case Cpc_cmd_pwr_down:
      cpu->stop_vcpu();
      break;
    case Cpc_cmd_pwr_up:
    case Cpc_cmd_reset:
      cpu->start_vcpu(_bev_base);
      break;
    }
}
}

View File

@@ -0,0 +1,114 @@
/*
* Copyright (C) 2017-2018, 2020, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include "debug.h"
#include "mmio_device.h"
#include "cpu_dev_array.h"
namespace Vdev {
// Virtual MIPS cluster power controller (CPC). Implements per-core power
// commands and status for the guest and provides the backend for the
// coherency manager's CPU-local register blocks.
class Mips_cpc : public Vmm::Mmio_device_t<Mips_cpc>
{
private:
  // CPU-local CPC register offsets.
  enum Cpc_local_registers
  {
    Cpc_cl_cmd_reg = 0x0,
    Cpc_cl_stat_conf_reg = 0x8
  };

  // Commands accepted via the command register.
  enum Cpc_commands
  {
    Cpc_cmd_clock_off = 1,
    Cpc_cmd_pwr_down = 2,
    Cpc_cmd_pwr_up = 3,
    Cpc_cmd_reset = 4
  };

public:
  // Guest-physical layout of the CPC register block.
  enum Memmap
  {
    Cpc_size = 0x6000,
    Core_local_base = 0x2000,
    Core_other_base = 0x4000,
    Control_block_size = 0x2000
  };

  void register_cpus(cxx::Ref_ptr<Vmm::Cpu_dev_array> cpus)
  { _cpus = cpus; }

  /// Set the boot exception vector base used when (re)starting a vCPU.
  void set_bev_base(l4_umword_t value)
  { _bev_base = value; }

  l4_umword_t bev_base() const
  { return _bev_base; }

  l4_umword_t read(unsigned reg, char size, unsigned cpuid);
  void write(unsigned reg, char size, l4_umword_t value, unsigned cpuid);

  /// Read a coherency-manager CPU-local register, either of the requesting
  /// core or (if \a other) of the core it selected as "other".
  l4_umword_t cm_read_core(unsigned reg, unsigned cpuid, bool other)
  {
    if (cpuid >= _cpus->size() || !_cpus->vcpu_exists(cpuid))
      return 0;
    if (other)
      {
        cpuid = _cpus->cpu(cpuid)->core_other();
        if (cpuid >= _cpus->size() || !_cpus->vcpu_exists(cpuid))
          {
            Dbg(Dbg::Cpu, Dbg::Info, "CMloc").printf(
              "CM reading from uninitialised core %d ignored.\n", cpuid);
            return 0;
          }
      }
    Dbg(Dbg::Cpu, Dbg::Trace, "CMloc").printf(
      "core %d: reading CM @ 0x%x\n", cpuid, reg);
    return _cpus->cpu(cpuid)->read_cm_reg(reg);
  }

  /// Write a coherency-manager CPU-local register, either of the requesting
  /// core or (if \a other) of the core it selected as "other".
  void cm_write_core(unsigned reg, l4_umword_t value, unsigned cpuid,
                     bool other)
  {
    if (cpuid >= _cpus->size() || !_cpus->vcpu_exists(cpuid))
      return;
    if (other)
      {
        cpuid = _cpus->cpu(cpuid)->core_other();
        if (cpuid >= _cpus->size() || !_cpus->vcpu_exists(cpuid))
          {
            Dbg(Dbg::Cpu, Dbg::Info, "CMloc").printf(
              "CM writing to uninitialised core %d ignored.\n", cpuid);
            return;
          }
      }
    Dbg(Dbg::Cpu, Dbg::Trace, "CMloc").printf(
      "core %d: writing CM 0x%lx @ 0x%x.\n", cpuid, value, reg);
    _cpus->cpu(cpuid)->write_cm_reg(reg, value);
  }

  /// Return the maximum CPU id in use.
  unsigned max_cpuid() const
  { return _cpus->max_cpuid(); }

  char const *dev_name() const override { return "Mips_cpc"; }

private:
  l4_umword_t cpc_read_core(unsigned reg, unsigned cpuid);
  void cpc_write_core(unsigned reg, l4_umword_t value, unsigned cpuid);

  // BUGFIX: zero-initialize the boot exception vector base. It is read via
  // bev_base() and passed to start_vcpu() and would otherwise be
  // indeterminate if the guest powers up a core before writing GCR_BEV_BASE.
  l4_umword_t _bev_base = 0;
  cxx::Ref_ptr<Vmm::Cpu_dev_array> _cpus;
};
}

View File

@@ -0,0 +1,125 @@
/*
* Copyright (C) 2017, 2019, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "cpu_dev.h"
#include "guest_entry.h"
// Table mapping device tree "compatible" strings to the MIPS processor
// id reported to the guest. Terminated by a {0, nullptr} sentinel.
static const std::pair<l4_umword_t, const char *> MIPS_PROC_IDS[] =
  {{0x0001a700, "mips,m5150"},
   {0x0001a800, "mips,p5600"},
   {0x0001a900, "mips,i6400"},
   {0x0001b024, "mips,i6500"},
   {0, nullptr}};

// Debug channels for CPU-related messages at the three verbosity levels.
static Dbg warn(Dbg::Cpu, Dbg::Warn, "CPU");
static Dbg info(Dbg::Cpu, Dbg::Info, "CPU");
static Dbg trace(Dbg::Cpu, Dbg::Trace, "CPU");
namespace Vmm
{
/**
 * Look up the processor id for a device tree "compatible" string.
 *
 * \param compatible  Compatible string from the CPU node, may be nullptr.
 *
 * \return Matching entry from MIPS_PROC_IDS, or Cpu_dev::Default_procid
 *         when the string is absent or unknown.
 */
static l4_umword_t
get_proc_type(char const *compatible)
{
  if (!compatible)
    return Cpu_dev::Default_procid;

  // Scan the sentinel-terminated id table for an exact string match.
  for (unsigned i = 0; MIPS_PROC_IDS[i].second != nullptr; ++i)
    if (strcmp(MIPS_PROC_IDS[i].second, compatible) == 0)
      return MIPS_PROC_IDS[i].first;

  return Cpu_dev::Default_procid;
}
/**
 * Construct a MIPS CPU device.
 *
 * \param idx      Logical vCPU index.
 * \param phys_id  Physical CPU to schedule the vCPU on.
 * \param node     Optional device tree node describing this CPU.
 */
Cpu_dev::Cpu_dev(unsigned idx, unsigned phys_id, Vdev::Dt_node const *node)
: Generic_cpu_dev(idx, phys_id), _status(0), _core_other(0)
{
  // If a compatible property exists, it may be used to specify
  // the reported CPU type (if supported by architecture). Without
  // compatible property, the default is used.
  char const *compatible = node ? node->get_prop<char>("compatible", nullptr)
                                : nullptr;
  _vcpu.set_proc_id(get_proc_type(compatible));
  _vcpu.alloc_fpu_state();
  // CPC power sequencer starts non-coherent; set_coherent() switches the
  // core to the coherent state once it is brought up.
  _status.seq_state() = Seq_non_coherent;
}
/**
 * Reset the vCPU into guest mode and resume guest execution.
 *
 * Configures the vCPU entry handler and stack, enables guest control
 * features and guest segmentation, then enters the guest via
 * vcpu_resume_commit(). Does not return on success.
 */
void
Cpu_dev::reset()
{
  l4_umword_t sp;
  // Reuse the current thread stack for the VMM entry handler.
  asm ("move %0, $sp" : "=r" (sp));

  _vcpu->saved_state = L4_VCPU_F_FPU_ENABLED
                       | L4_VCPU_F_USER_MODE
                       | L4_VCPU_F_IRQ
                       | L4_VCPU_F_PAGE_FAULTS
                       | L4_VCPU_F_EXCEPTIONS;
  _vcpu->entry_ip = (l4_umword_t)&c_vcpu_entry;
  // 16-byte align the handler stack pointer.
  _vcpu->entry_sp = sp & ~0xfUL;
  _vcpu->r.status |= 8;

  auto *s = _vcpu.state();
  // disable trapping of CF1&2, CG and GT, enable ctl2
  s->guest_ctl_0 |= 0x3000083;
  s->guest_ctl_0_ext |= 0x10; // CGI
  // Guest segmentation setup; propagate the guest's cache coherency
  // attribute (low 3 bits of guest Config0) into the segment controls.
  l4_umword_t cca = s->g_cfg[0] & 7UL;
  s->g_seg_ctl[0] = 0x00200010;
  s->g_seg_ctl[1] = 0x00000002 | (cca << 16);
  s->g_seg_ctl[2] = 0x04300030 | (cca << 16) | cca;
  // Publish the vCPU id in the low bits of the guest EBase register.
  s->g_ebase = (s->g_ebase & ~0x3ffUL) | _vcpu.get_vcpu_id();
  s->set_modified(L4_VM_MOD_GUEST_CTL_0
                  | L4_VM_MOD_GUEST_CTL_0_EXT
                  | L4_VM_MOD_CFG
                  | L4_VM_MOD_EBASE
                  | L4_VM_MOD_XLAT);

  Dbg(Dbg::Core, Dbg::Info)
    .printf("Starting vcpu %d @ 0x%lx (handler @ %lx with stack @ %lx)\n",
            _vcpu.get_vcpu_id(), _vcpu->r.ip, _vcpu->entry_ip, _vcpu->entry_sp);

  L4::Cap<L4::Thread> myself;
  auto e = l4_error(myself->vcpu_resume_commit(myself->vcpu_resume_start()));

  // Only reached when entering the guest failed.
  Err().printf("VMM exited with %ld\n", e);
}
/**
 * Bring the vCPU up and start guest execution.
 *
 * \param bev_base  Entry address used when the reset base register
 *                  selects BEV mode (bit 0 set).
 */
void
Cpu_dev::start_vcpu(l4_addr_t bev_base)
{
  info.printf("Start of vcpu %d requested.\n", _vcpu.get_vcpu_id());

  // setup vcpu state
  if (_reset_base & 1)
    {
      // Bit 0 of the reset base selects the boot exception vector base.
      _vcpu->r.ip = bev_base;
      trace.printf("Using BEV reset base 0x%lx\n", bev_base);
    }
  else
    {
      _vcpu->r.ip = _reset_base & Cm_loc_reset_base_addr_mask;
      trace.printf("Using Core reset base 0x%lx\n", _reset_base);
    }

  // Guest starts in error level with boot exception vectors enabled.
  _vcpu.state()->g_status |= (1 << 2) | (1 << 22); // ERL, BEV
  reschedule();

  // consider it officially done
  // XXX should that be done in reset code?
  set_coherent();
}
/// Stop the vCPU. Currently only logs the request; no state is changed.
void
Cpu_dev::stop_vcpu()
{
  warn.printf("Stop of vcpu %d requested. NOT IMPLEMENTED.\n",
              _vcpu.get_vcpu_id());
}
} // namespace

View File

@@ -0,0 +1,157 @@
/*
* Copyright (C) 2017-2020, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include "generic_cpu_dev.h"
#include "monitor/cpu_dev_cmd_handler.h"
namespace Vmm {

/**
 * MIPS CPU device.
 *
 * Combines the generic vCPU handling with the per-core register state of
 * the Coherency Manager (CM) and Cluster Power Controller (CPC).
 */
class Cpu_dev
: public Generic_cpu_dev,
  public Monitor::Cpu_dev_cmd_handler<Monitor::Enabled, Cpu_dev>
{
public:
  // Maximum number of CPUs that are addressable.
  enum { Max_cpus = 32 };

  /// Processor id reported when no known "compatible" entry matches.
  enum { Default_procid = 0x00010000 };

  /// Offsets of the core-local CM registers.
  enum Cm_local_registers
  {
    Cm_loc_coh_en = 0x08,
    Cm_loc_config = 0x10,
    Cm_loc_other = 0x18,
    Cm_loc_reset_base = 0x20,
    Cm_loc_id = 0x28,
    Cm_loc_reset_ext_base = 0x30
  };

  // Mask of valid bits for various CM registers
  enum Cm_register_masks
  {
    Cm_loc_other_mask = 0x3f,
    Cm_loc_reset_base_mask = ~0xffcUL,
    Cm_loc_reset_base_addr_mask = ~0xfffUL,
    Cm_loc_reset_ext_base_mask = 0xcff000ff
  };

  /// CPC power sequencer states as visible in the local status register.
  enum Sequencer_state
  {
    Seq_pwr_down = 0x00,
    Seq_reset = 0x04,
    Seq_non_coherent = 0x06,
    Seq_coherent = 0x07,
  };

  /// Layout of the CPC core-local status/configuration register.
  struct Local_status_reg
  {
    l4_uint32_t raw;
    Local_status_reg() = default;
    explicit Local_status_reg(l4_uint32_t raw) : raw(raw) {}

    CXX_BITFIELD_MEMBER(23, 23, pwrup_event, raw);
    CXX_BITFIELD_MEMBER(19, 22, seq_state, raw);
    CXX_BITFIELD_MEMBER(17, 17, clkgat_impl, raw);
    CXX_BITFIELD_MEMBER(16, 16, pwrdn_impl, raw);
    CXX_BITFIELD_MEMBER(15, 15, jtag_probe, raw);
    CXX_BITFIELD_MEMBER(14, 14, ci_pwrup, raw);
    CXX_BITFIELD_MEMBER(13, 13, ci_vddok, raw);
    CXX_BITFIELD_MEMBER(12, 12, ci_rail_stable, raw);
    CXX_BITFIELD_MEMBER(11, 11, coh_en, raw);
    CXX_BITFIELD_MEMBER(10, 10, lpack, raw);
    CXX_BITFIELD_MEMBER(8, 9, pwup_policy, raw);
    CXX_BITFIELD_MEMBER(7, 7, reset_hold, raw);
    CXX_BITFIELD_MEMBER(4, 4, io_trffc_en, raw);
    CXX_BITFIELD_MEMBER(0, 3, cmd, raw);
  };

  Cpu_dev(unsigned idx, unsigned phys_id, Vdev::Dt_node const *node);

  /**
   * Translate a device tree "reg" value to an internally usable CPU id.
   *
   * For most architectures this is NOP, but some archictures like ARM
   * might encode topology information into this value, which needs to
   * be translated.
   */
  static unsigned dtid_to_cpuid(l4_int32_t prop_val)
  { return prop_val; }

  static bool has_fixed_dt_mapping() { return true; }

  /// Core currently selected via the CM "other" register.
  unsigned core_other() const
  { return _core_other; }

  /// Raw CPC local status register value.
  l4_uint32_t cpc_status() const
  { return _status.raw; }

  /// Record the last CPC command issued for this core.
  void set_last_command(unsigned cmd)
  { _status.cmd() = cmd; }

  /// Read a core-local CM register; unknown offsets read as 0.
  l4_umword_t read_cm_reg(unsigned reg)
  {
    switch (reg)
      {
      case Cm_loc_coh_en: return _status.coh_en();
      case Cm_loc_config: return 0; // one VP per core
      case Cm_loc_other: return _core_other << 8;
      case Cm_loc_reset_base: return _reset_base;
      case Cm_loc_id: return _vcpu.get_vcpu_id();
      case Cm_loc_reset_ext_base: return _ext_reset_base;
      }
    return 0;
  }

  /// Write a core-local CM register, applying the valid-bit masks.
  /// Writes to unknown offsets are ignored.
  void write_cm_reg(unsigned reg, l4_umword_t value)
  {
    switch (reg)
      {
      case Cm_loc_coh_en:
        _status.coh_en() = value & 1;
        break;
      case Cm_loc_other:
        _core_other = (value >> 8) & Cm_loc_other_mask;
        break;
      case Cm_loc_reset_base:
        _reset_base = value & Cm_loc_reset_base_mask;
        break;
      case Cm_loc_reset_ext_base:
        _ext_reset_base = value & Cm_loc_reset_ext_base_mask;
        break;
      }
  }

  /// Mark the core coherent: sequencer state and coherence enable bit.
  void set_coherent()
  {
    _status.seq_state() = Seq_coherent;
    _status.coh_en() = 1;
  }

  void start_vcpu(l4_addr_t bev_base);
  void stop_vcpu();
  void reset() override;
  void stop() override { stop_vcpu(); }

private:
  /// CPC state: local status register
  Local_status_reg _status;
  /// CM state: reset address register. Initialised to 0 so reads via
  /// read_cm_reg() before the guest programs it are well-defined (was
  /// previously uninitialised).
  /// NOTE(review): real CM hardware resets this to the boot vector
  /// (0x1fc00000) -- confirm whether that default is required here.
  l4_umword_t _reset_base = 0;
  /// CM state: extension to reset address register (also 0-initialised).
  l4_umword_t _ext_reset_base = 0;
  /// CM/CPC state: selected other core.
  /// Note that starting with CM3, CM selects the other CPU for CM _and_ CPC.
  unsigned char _core_other;
};

}

View File

@@ -0,0 +1,22 @@
/*
* Copyright (C) 2016-2017, 2019, 2023-2024 Kernkonzept GmbH.
* Author(s): Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
namespace Vmm {
  class Vm;
}

namespace Monitor {

/**
 * No-op guest debugger stub.
 *
 * Keeps the generic monitor code compiling on configurations/arches
 * without a real debugger implementation; the constructor ignores the VM.
 */
class Guest_debugger
{
public:
  explicit Guest_debugger(Vmm::Vm *)
  {}
};

}

View File

@@ -0,0 +1,334 @@
/*
* Copyright (C) 2016-2020, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <cstdio>
#include <functional>
#include <l4/sys/kip.h>
#include "guest.h"
#include "device_factory.h"
#include "device_tree.h"
#include "debug.h"
#include "gic.h"
static Dbg trace(Dbg::Irq, Dbg::Trace, "GIC");
static Dbg warn(Dbg::Irq, Dbg::Warn, "GIC");
static Dbg dbg(Dbg::Irq, Dbg::Info, "GIC");
namespace Gic {

/**
 * Construct the GIC distributor emulation.
 *
 * \param core_ic  Per-VPE MIPS core interrupt controller that routed
 *                 interrupts are forwarded to.
 *
 * Initialises the shared-section backing store: interrupt count, VPE
 * count, revision register and the available/pending bitmaps.
 */
Dist::Dist(Mips_core_ic *core_ic)
: Read_mapped_mmio_device_t("Dist", Gic_shared_size),
  _core_ic(core_ic)
{
  static_assert(L4_PAGESIZE <= 16 * 1024, "Maximum supported page size is 16k");

  // set up shared section
  auto *cfg = gic_mem<Gic_config_reg>(Gic_sh_config);
  cfg->raw = 0;
  // numint encodes (number of interrupts / 8) - 1
  cfg->numint() = (Num_irqs >> 3) - 1;
  cfg->pvps() = Num_vpes;
  // set revision to 4.0, as reported by Baikal board
  *gic_mem<l4_uint32_t>(Gic_sh_revision) = 4 << 8;

  auto *sh = gic_mem<char>(0);
  // All interrupts implemented/available, none pending initially.
  memset(sh + Gic_sh_int_avail, 0xff, Num_irqs >> 3);
  memset(sh + Gic_sh_pend, 0, Num_irqs >> 3);
}
/**
 * MMIO read handler for the GIC window.
 *
 * \param reg     Byte offset of the accessed register.
 * \param size    Access width as log2 of the byte count (2 = 32 bit,
 *                3 = 64 bit).
 * \param cpu_id  Id of the accessing VPE.
 *
 * \return Register value; 0 for unsupported widths or unknown registers.
 */
l4_umword_t
Dist::read(unsigned reg, char size, unsigned cpu_id)
{
  assert(cpu_id < Num_vpes);

  // Only 32-bit and 64-bit accesses are supported.
  if (size < 2)
    {
      warn.printf("WARNING: read @0x%x with unsupported width %d ignored\n",
                  reg, 8 << size);
      return 0;
    }

  // Shared section: served directly from the backing store.
  if (reg < Gic_shared_base + Gic_shared_size)
    {
      if (size == 3)
        return *gic_mem<l4_uint64_t>(reg);
      else
        return *gic_mem<l4_uint32_t>(reg);
    }

  // Core-local section: registers of the accessing VPE.
  if (reg >= Gic_core_local_base && reg < Gic_core_other_base)
    return read_cpu(reg - Gic_core_local_base, size, cpu_id);

  // Core-other section: registers of the VPE selected via the
  // accessing VPE's local "other" register.
  if (reg >= Gic_core_other_base && reg < Gic_user_visible_base)
    return read_cpu(reg - Gic_core_other_base, size,
                    _vcpu_info[cpu_id].other_cpu);

  dbg.printf("Reading unknown register @ 0x%x (%d)\n", reg, size);
  return 0;
}
/**
 * MMIO write handler for the GIC window.
 *
 * \param reg     Byte offset of the accessed register.
 * \param size    Access width as log2 of the byte count (2 = 32 bit,
 *                3 = 64 bit).
 * \param value   Value written by the guest.
 * \param cpu_id  Id of the accessing VPE.
 *
 * Routes core-local/core-other writes to write_cpu() and decodes the
 * shared-section registers (wedge, masks, configuration, routing).
 */
void
Dist::write(unsigned reg, char size, l4_umword_t value, unsigned cpu_id)
{
  assert(cpu_id < Num_vpes);

  // Only 32-bit and 64-bit accesses are supported.
  if (size < 2)
    {
      warn.printf("WARNING: write @0x%x with unsupported width %d ignored\n",
                  reg, 8 << size);
      return;
    }

  if (reg >= Gic_core_local_base && reg < Gic_core_other_base)
    {
      write_cpu(reg - Gic_core_local_base, size, value, cpu_id);
      return;
    }

  if (reg >= Gic_core_other_base && reg < Gic_user_visible_base)
    {
      write_cpu(reg - Gic_core_other_base, size, value,
                _vcpu_info[cpu_id].other_cpu);
      return;
    }

  // write must be to shared section
  if (reg == Gic_sh_wedge)
    {
      // WEDGE register: bit 31 selects set (1) or clear (0) of an edge irq.
      Gic_wedge_reg wedge(value);
      if (wedge.irq() < Num_irqs)
        {
          if (wedge.rw())
            set(wedge.irq());
          else
            clear(wedge.irq());
        }
    }
  else if (reg >= Gic_sh_rmask && reg < Gic_sh_rmask + Num_irqs / 8)
    {
      reset_mask(reg - Gic_sh_rmask, size, value);
    }
  else if (reg >= Gic_sh_smask && reg < Gic_sh_smask + Num_irqs / 8)
    {
      set_mask(reg - Gic_sh_smask, size, value);
    }
  else if (reg >= Gic_sh_pol && reg < Gic_sh_wedge)
    {
      // polarity, edge, dual configuration ignored
      gic_mem_set(reg, size, value);
    }
  else if (reg >= Gic_sh_pin && reg < irq_to_pinreg(Num_irqs))
    {
      // Pin routing changed: store and reconnect the interrupt source.
      gic_mem_set(reg, size, value);
      setup_source(pinreg_to_irq(reg));
    }
  else if (reg >= Gic_sh_map && reg < irq_to_mapreg(Num_irqs))
    {
      // VPE mapping changed: store and reconnect the interrupt source.
      gic_mem_set(reg, size, value);
      setup_source(mapreg_to_irq(reg));
    }
  else
    dbg.printf("Writing ignored 0x%lx @ 0x%x (%d)\n", value, reg, size);
}
/**
 * Read a core-local (or core-other) GIC register.
 *
 * \param reg     Offset relative to the local section base.
 * \param cpu_id  Target VPE id.
 *
 * \return Register value; 0 for invalid VPEs or unhandled registers.
 */
l4_umword_t
Dist::read_cpu(unsigned reg, char, unsigned cpu_id)
{
  // Bound check against the emulated VPE count before indexing
  // _vcpu_info (was a magic 32; use Num_vpes for consistency).
  if (cpu_id >= Num_vpes)
    {
      dbg.printf("unknown VPE id %d. Read ignored @ 0x%x\n", cpu_id, reg);
      return 0;
    }

  switch (reg)
    {
    case Gic_loc_other_addr:
      return _vcpu_info[cpu_id].other_cpu;
    case Gic_loc_ident:
      return cpu_id;
    }

  trace.printf("Local read from cpu %d ignored @ 0x%x\n", cpu_id, reg);
  return 0;
}
/**
 * Write a core-local (or core-other) GIC register.
 *
 * \param reg     Offset relative to the local section base.
 * \param value   Value written by the guest.
 * \param cpu_id  Target VPE id.
 *
 * Only the "other" selection register is handled; everything else is
 * traced and ignored.
 */
void
Dist::write_cpu(unsigned reg, char, l4_umword_t value, unsigned cpu_id)
{
  // Bound check against the emulated VPE count before indexing
  // _vcpu_info (was a magic 32; use Num_vpes for consistency).
  if (cpu_id >= Num_vpes)
    {
      dbg.printf("unknown VPE id %d. Write ignored 0x%lx @ 0x%x\n", cpu_id,
                 value, reg);
      return;
    }

  switch (reg)
    {
    case Gic_loc_other_addr:
      // Only accept valid VPE ids for the "other" selection.
      if (value < Num_vpes)
        _vcpu_info[cpu_id].other_cpu = value;
      return;
    }

  trace.printf("Local write to cpu %d ignored 0x%lx @ 0x%x\n", cpu_id, value,
               reg);
}
/**
 * Disable interrupts (RMASK write).
 *
 * \param reg   Byte offset into the mask bitmap.
 * \param size  Access width (3 = 64 bit, otherwise 32 bit).
 * \param mask  Bits to clear in the enable mask.
 *
 * Interrupts that were pending while being disabled are acknowledged
 * upstream so the core IC line is released.
 */
void
Dist::reset_mask(unsigned reg, char size, l4_umword_t mask)
{
  assert(reg * 8 < Num_irqs);

  l4_umword_t pending;

  std::lock_guard<std::mutex> lock(_lock);

  if (size == 3)
    {
      *gic_mem<l4_uint64_t>(Gic_sh_mask + reg) &= ~mask;
      pending = mask & *gic_mem<l4_uint64_t>(Gic_sh_pend + reg);
    }
  else
    {
      *gic_mem<l4_uint32_t>(Gic_sh_mask + reg) &= ~mask;
      pending = ((l4_uint32_t) mask) & *gic_mem<l4_uint32_t>(Gic_sh_pend + reg);
    }

  // Ack all interrupts that were disabled while still pending.
  int irq = reg * 8;
  while (pending)
    {
      if (pending & 1)
        _irq_array[irq]->ack();
      ++irq;
      pending >>= 1;
    }
}
/**
 * Enable interrupts (SMASK write).
 *
 * \param reg   Byte offset into the mask bitmap.
 * \param size  Access width (3 = 64 bit, otherwise 32 bit).
 * \param mask  Bits to set in the enable mask.
 *
 * Notifies registered interrupt sources (eoi) and re-injects interrupts
 * that are still pending after being re-enabled.
 */
void
Dist::set_mask(unsigned reg, char size, l4_umword_t mask)
{
  assert(reg * 8 < Num_irqs);
  int irq = reg * 8;

  // narrow mask down to register width
  if ((8UL << size) < 8 * sizeof(l4_umword_t))
    mask &= (1UL << (8 << size)) - 1;

  // Notify interrupt sources where necessary.
  // Needs to be done before taking the lock as the IRQ source
  // may want to clear a pending interrupt.
  l4_umword_t eoibits = mask;
  for (int i = 0; eoibits; ++i)
    {
      if ((eoibits & 1) && _sources[irq + i])
        _sources[irq + i]->eoi();
      eoibits >>= 1;
    }

  std::lock_guard<std::mutex> lock(_lock);

  if (size == 3)
    *gic_mem<l4_uint64_t>(Gic_sh_mask + reg) |= mask;
  else
    *gic_mem<l4_uint32_t>(Gic_sh_mask + reg) |= mask;

  l4_umword_t pending = mask;
  if (size == 3)
    pending &= *gic_mem<l4_uint64_t>(Gic_sh_pend + reg);
  else
    pending &= *gic_mem<l4_uint32_t>(Gic_sh_pend + reg);

  // reinject any interrupts that are still pending
  for (int i = 0; pending; ++i)
    {
      if (pending & 1)
        _irq_array[irq + i]->inject();
      pending >>= 1;
    }
}
/**
 * (Re)connect an interrupt to its target core IC after a routing change.
 *
 * \param irq  Interrupt number whose map/pin registers were updated.
 *
 * Reads the routing (map) and pin registers and creates — or drops —
 * the Irq_sink used for injection.
 */
void
Dist::setup_source(unsigned irq)
{
  assert(irq < Num_irqs);
  std::lock_guard<std::mutex> lock(_lock);
  auto vp = *gic_mem<l4_uint32_t>(irq_to_mapreg(irq));

  // No target VPE selected: disconnect the interrupt.
  if (!(vp & 0x1f))
    {
      _irq_array[irq].reset();
      return;
    }

  // Select the lowest VPE set in the routing bitmap.
  unsigned cpuid = 0;
  for (; !(vp & 1); ++cpuid, vp >>= 1)
    ;

  auto ic = _core_ic->get_ic(cpuid);
  auto pin = *gic_mem<Gic_pin_reg>(irq_to_pinreg(irq));
  trace.printf("GIC irq 0x%x: setting source for CPU %d to pin 0x%x (IC %p)\n",
               irq, cpuid, pin.raw, ic.get());

  // only int pins at the moment
  if (ic && pin.pin() && pin.map() < 6)
    _irq_array[irq] = cxx::make_unique<Vmm::Irq_sink>(ic, pin.map() + 2);
  else
    _irq_array[irq].reset();
}
/// Dump the routing, mask and pending state of all connected interrupts
/// to the given stream (monitor/debug aid).
void
Dist::show_state(FILE *f)
{
  fprintf(f, " Interrupts available: %d\n", Num_irqs);
  for (unsigned i = 0; i < Num_irqs; ++i)
    {
      if (_irq_array[i])
        fprintf(f, " Int %d => core IC %u  %s/%s\n",
                i, gic_mem<Gic_pin_reg>(Gic_sh_pin + i * 4)->map() + 2,
                irq_mask()[i] ? "on" : "off",
                irq_pending()[i] ? "pending" : "low");
    }
}
namespace {

/// Device tree factory creating the GIC distributor for "mti,gic" nodes.
struct F : Vdev::Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Vdev::Device_lookup *devs,
                                    Vdev::Dt_node const &node) override
  {
    l4_uint64_t size;
    // Validate that the node carries a readable 'reg' property before
    // registering the device.
    int res = node.get_reg_val(0, nullptr, &size);
    if (res < 0)
      {
        Err().printf("Failed to read 'reg' from node %s: %s\n",
                     node.get_name(), node.strerror(res));
        throw L4::Runtime_error(-L4_EINVAL);
      }

    auto g = Vdev::make_device<Dist>(devs->vmm()->core_ic().get());
    devs->vmm()->register_mmio_device(g, Vmm::Region_type::Virtual, node);
    return g;
  }
};

static F f;
static Vdev::Device_type t = { "mti,gic", nullptr, &f };

}
} // namespace

View File

@@ -0,0 +1,228 @@
/*
* Copyright (C) 2015-2018, 2020, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <mutex>
#include <l4/cxx/bitmap>
#include <l4/cxx/unique_ptr>
#include <l4/re/dataspace>
#include <l4/re/rm>
#include <l4/re/util/cap_alloc>
#include "irq.h"
#include "core_ic.h"
#include "device_tree.h"
#include "mmio_device.h"
namespace Gic {

/**
 * Emulation of the MIPS Global Interrupt Controller (GIC) distributor.
 *
 * The shared register section is backed by real memory mapped into the
 * guest (Read_mapped_mmio_device_t); routed interrupts are forwarded to
 * the per-VPE core interrupt controller (Mips_core_ic).
 */
class Dist
: public Vmm::Read_mapped_mmio_device_t<Dist, char>,
  public Ic
{
  enum Config
  {
    Num_irqs = 128,            // maximum irq supported by Linux 3.19
    Cfg_words = Num_irqs >> 5, // 32 irq config bits per word
    Num_vpes = 32              // number of VPEs the GIC can handle
  };

  // The P5600 spec says there is a maximum of 256 irqs but the
  // data structures can accomodate up to 512. Only then the system breaks.
  static_assert(Num_irqs <= 512, "Maximum supported irqs is 512");
  static_assert(Num_irqs % 8 == 0, "Number of IRQs must be a multipe of 8");

  /// Section bases/sizes and register offsets of the GIC window.
  enum Mips_gic_registers
  {
    Gic_shared_base = 0,
    Gic_shared_size = 32 * 1024,
    Gic_core_local_base = 0x8000,
    Gic_core_other_base = 0xc000,
    Gic_local_size = 16 * 1024,
    Gic_user_visible_base = 0x16000,
    Gic_user_size = 64 * 1024,
    Gic_sh_config = 0x0,
    Gic_sh_counter = 0x10,
    Gic_sh_counter_lo = 0x10,
    Gic_sh_counter_hi = 0x14,
    Gic_sh_revision = 0x20,
    Gic_sh_int_avail = 0x28,
    Gic_sh_gid_config = 0x80,
    Gic_sh_pol = 0x100,
    Gic_sh_trig = 0x180,
    Gic_sh_dual = 0x200,
    Gic_sh_wedge = 0x280,
    Gic_sh_rmask = 0x300,
    Gic_sh_smask = 0x380,
    Gic_sh_mask = 0x400,
    Gic_sh_pend = 0x480,
    Gic_sh_pin = 0x500,
    Gic_sh_map = 0x2000,
    Gic_loc_other_addr = 0x80,
    Gic_loc_ident = 0x88,
  };

  /// Layout of the shared configuration register (GIC_SH_CONFIG).
  struct Gic_config_reg
  {
    l4_uint32_t raw;
    CXX_BITFIELD_MEMBER(31, 31, vzp, raw);
    CXX_BITFIELD_MEMBER(30, 30, vze, raw);
    CXX_BITFIELD_MEMBER(29, 29, irc, raw);
    CXX_BITFIELD_MEMBER(28, 28, countstop, raw);
    CXX_BITFIELD_MEMBER(24, 27, countbits, raw);
    CXX_BITFIELD_MEMBER(16, 23, numint, raw);
    CXX_BITFIELD_MEMBER(8, 15, irgid, raw);
    CXX_BITFIELD_MEMBER(0, 6, pvps, raw);
  };

  /// Layout of a per-interrupt pin routing register.
  struct Gic_pin_reg
  {
    l4_uint32_t raw;
    CXX_BITFIELD_MEMBER(31, 31, pin, raw);
    CXX_BITFIELD_MEMBER(30, 30, nmi, raw);
    CXX_BITFIELD_MEMBER(8, 15, gid, raw);
    CXX_BITFIELD_MEMBER(0, 5, map, raw);
  };

  /// Layout of the WEDGE register: bit 31 set/clear, low bits irq number.
  struct Gic_wedge_reg
  {
    l4_umword_t raw;
    CXX_BITFIELD_MEMBER(31, 31, rw, raw);
    CXX_BITFIELD_MEMBER(0, 7, irq, raw);
    explicit Gic_wedge_reg(l4_umword_t value) : raw(value) {}
  };

  /// Per-VPE emulation state (currently only the "other" selection).
  struct Cpu_info
  {
    unsigned other_cpu = 0;
  };

public:
  Dist(Mips_core_ic *core_ic);

  l4_umword_t read(unsigned reg, char size, unsigned cpu_id);
  void write(unsigned reg, char size, l4_umword_t value, unsigned cpu_id);

  /// Mark an interrupt pending and inject it if it is enabled.
  void set(unsigned irq) override
  {
    assert(irq < Num_irqs);
    std::lock_guard<std::mutex> lock(_lock);
    if (!_irq_array[irq])
      return;
    irq_pending().set_bit(irq);
    if (irq_mask()[irq])
      _irq_array[irq]->inject();
  }

  /// Clear a pending interrupt and ack it upstream if it was enabled.
  void clear(unsigned irq) override
  {
    assert(irq < Num_irqs);
    std::lock_guard<std::mutex> lock(_lock);
    if (!_irq_array[irq])
      return;
    irq_pending().clear_bit(irq);
    if (irq_mask()[irq])
      _irq_array[irq]->ack();
  }

  /// Register (or unregister with nullptr) a device eoi/config callback.
  /// Throws -L4_EEXIST when the slot is already taken.
  void bind_irq_src_handler(unsigned irq, Irq_src_handler *handler) override
  {
    assert(irq < Num_irqs);
    if (handler && _sources[irq])
      throw L4::Runtime_error(-L4_EEXIST);

    _sources[irq] = handler;
  }

  Irq_src_handler *get_irq_src_handler(unsigned irq) const override
  { return _sources[irq]; }

  /// Decode an interrupt specifier from a device tree "interrupts"
  /// property (3 cells; the irq number is in the second cell).
  int dt_get_interrupt(fdt32_t const *prop, int propsz, int *read) const override
  {
    if (propsz < 3)
      return -L4_ERANGE;

    int irq = fdt32_to_cpu(prop[1]);

    if (read)
      *read = 3;

    return irq;
  }

  void reset_mask(unsigned reg, char size, l4_umword_t mask);
  void set_mask(unsigned reg, char size, l4_umword_t mask);
  void setup_source(unsigned irq);
  void show_state(FILE *);

private:
  /**
   * Return offset of map register for the given IRQ.
   *
   * Map registers spaced at 0x20 byte intervals.
   */
  unsigned irq_to_mapreg(unsigned irq) const
  { return Gic_sh_map + irq * 0x20; }

  unsigned mapreg_to_irq(unsigned offset) const
  { return (offset - Gic_sh_map) / 0x20; }

  /// Bitmap view of the shared enable-mask registers.
  cxx::Bitmap_base irq_mask() const
  { return cxx::Bitmap_base(gic_mem<void>(Gic_sh_mask)); }

  /// Bitmap view of the shared pending registers.
  cxx::Bitmap_base irq_pending() const
  { return cxx::Bitmap_base(gic_mem<void>(Gic_sh_pend)); }

  /**
   * Return offset of pin register for the given IRQ.
   *
   * Pin registers spaced at 4 byte intervals.
   */
  unsigned irq_to_pinreg(unsigned irq) const
  { return Gic_sh_pin + irq * 4; }

  unsigned pinreg_to_irq(unsigned offset) const
  { return (offset - Gic_sh_pin) / 4; }

  /// Typed pointer into the shared-section backing store.
  template <typename T>
  T *gic_mem(unsigned offset) const
  { return reinterpret_cast<T *>(mmio_local_addr() + offset); }

  /// Store a value into the backing store with the access width used
  /// by the guest (64 bit when size == 3, else 32 bit).
  void gic_mem_set(unsigned offset, char size, l4_umword_t value) const
  {
    if (size == 3)
      *gic_mem<l4_uint64_t>(offset) = value;
    else
      *gic_mem<l4_uint32_t>(offset) = value;
  }

  l4_umword_t read_cpu(unsigned reg, char size, unsigned cpu_id);
  void write_cpu(unsigned reg, char size, l4_umword_t value,
                 unsigned cpu_id);

  Mips_core_ic *_core_ic;
  // array of IRQ connections towards core IC
  cxx::unique_ptr<Vmm::Irq_sink> _irq_array[Num_irqs];
  // registered device callbacks for configuration and eoi
  Irq_src_handler *_sources[Num_irqs] = {};
  Cpu_info _vcpu_info[Num_vpes];
  std::mutex _lock;
};

} // namespace

View File

@@ -0,0 +1,263 @@
/*
* Copyright (C) 2015-2018, 2022-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "binary_loader.h"
#include "device_factory.h"
#include "guest.h"
#include "guest_entry.h"
namespace Vmm {

/**
 * Construct the MIPS guest: core interrupt controller, coherency
 * manager (CM) and cluster power controller (CPC). The CM's MMIO region
 * is entered into the guest memory map and wired up to the CPC.
 */
Guest::Guest()
: _core_ic(Vdev::make_device<Gic::Mips_core_ic>()),
  _cm(Vdev::make_device<Vdev::Coherency_manager>(&_memmap)),
  _cpc(Vdev::make_device<Vdev::Mips_cpc>())
{
  _memmap[_cm->mem_region()] = _cm;
  _cm->register_cpc(_cpc);
}
/// Patch runtime information into the guest device tree before boot.
void
Guest::setup_device_tree(Vdev::Device_tree dt)
{
  // advertise CPU core timer frequency in DTS
  auto node = dt.path_offset("/cpus");
  node.setprop_u32("mips-hpt-frequency", l4re_kip()->frequency_cpu * 1000);
}
/**
 * Load the guest kernel binary into guest RAM.
 *
 * \param ram        Guest RAM abstraction to load into.
 * \param binary     Name/path of the binary to load.
 * \param free_list  Free RAM regions available for placement.
 *
 * \return Guest entry point address of the loaded image.
 */
l4_addr_t
Guest::load_binary(Vm_ram *ram, char const *binary, Ram_free_list *free_list)
{
  l4_addr_t entry;

  Boot::Binary_loader_factory bf;
  bf.load(binary, ram, free_list, &entry);

  return entry;
}
/**
 * Set up the initial register state for the MIPS boot protocol.
 *
 * \param devs          Device lookup providing vCPUs and guest RAM.
 * \param entry         Guest entry point (from load_binary()).
 * \param binary        Name of the loaded binary (becomes argv[0]).
 * \param cmd_line      Optional kernel command line (may be nullptr).
 * \param dt_boot_addr  Guest address of the device tree blob.
 */
void
Guest::prepare_binary_run(Vdev::Device_lookup *devs, l4_addr_t entry,
                          char const *binary, char const *cmd_line,
                          l4_addr_t dt_boot_addr)
{
  Vcpu_ptr vcpu = devs->cpus()->vcpu(0);
  Vm_ram *ram = devs->ram().get();

  /*
   * Setup arguments for Mips boot protocol
   */
  // Argument pointer table at guest page 1; the argument strings follow
  // directly after the two table slots.
  Guest_addr prom_tab(L4_PAGESIZE);
  size_t size = 2 * sizeof(l4_addr_t);

  auto prom_buf = prom_tab + size;
  size += strlen(binary) + 1;
  strcpy(ram->guest2host<char *>(prom_buf), binary);
  ram->guest2host<l4_addr_t *>(prom_tab)[0] = ram->guest_phys2boot(prom_buf);

  if (cmd_line)
    {
      strcpy(ram->guest2host<char *>(prom_tab + size), cmd_line);
      ram->guest2host<l4_addr_t *>(prom_tab)[1] = ram->guest_phys2boot(prom_tab + size);
      size += strlen(cmd_line) + 1;
    }

  // Flush the argument area so the guest sees it with caches disabled.
  l4_cache_clean_data(ram->guest2host<l4_addr_t>(prom_tab),
                      ram->guest2host<l4_addr_t>(prom_tab) + size);

  // Initial register setup:
  //  a0 - number of kernel arguments
  //  a1 - address of kernel arguments
  //  a2 - unused
  //  a3 - address of DTB
  vcpu->r.a0 = cmd_line ? 2 : 1;
  vcpu->r.a1 = ram->guest_phys2boot(prom_tab);
  vcpu->r.a2 = 0;
  vcpu->r.a3 = dt_boot_addr;
  vcpu->r.status = 8;

  // UHI boot protocol spec says that at least KX should be set when the
  // boot loader passes in 64bit addresses for the command line parameters.
  if (sizeof(l4_addr_t) == 8)
    vcpu->r.status |= 0xe0;

  vcpu->r.ip = entry;
}
/**
 * Power up all configured vCPUs and start guest execution on CPU 0.
 *
 * \param cpus  Array of CPU devices; empty slots are skipped.
 */
void
Guest::run(cxx::Ref_ptr<Cpu_dev_array> const &cpus)
{
  _cpc->register_cpus(cpus);

  for (auto cpu: *cpus.get())
    {
      if (!cpu)
        continue;

      cpu->vcpu()->user_task = _task.cap();
      cpu->powerup_cpu();
      // attach the core IC
      _core_ic->create_ic(cpu->vcpu().get_vcpu_id(), cpu->thread_cap());
    }

  // The boot CPU starts coherent; secondary cores are brought up later
  // by the guest via the CPC.
  cpus->cpu(0)->set_coherent();
  cpus->cpu(0)->startup();
}
/**
 * Dispatch a guest hypercall.
 *
 * \param hypcall_code  Decoded hypercall number from the instruction.
 * \param vcpu          vCPU that issued the hypercall.
 *
 * \return Jump_instr on success, -L4_ENOSYS for unknown hypercalls.
 */
int
Guest::dispatch_hypcall(Hypcall_code hypcall_code, Vcpu_ptr vcpu)
{
  if (hypcall_code == Hypcall_outchar)
    {
      // Console output of the single character passed in a0.
      _hypcall_print.print_char(vcpu->r.a0);
      return Jump_instr;
    }

  return -L4_ENOSYS;
}
/**
 * Main VMM entry point: dispatch one vCPU exit.
 *
 * Handles IPC wakeups, guest TLB faults / MMIO emulation and guest
 * exceptions (sensitive instructions, hypercalls, field changes), then
 * processes pending IPC and refreshes the core IC state before the
 * caller resumes the guest.
 */
void
Guest::handle_entry(Vcpu_ptr vcpu)
{
  auto *utcb = l4_utcb();
  unsigned cause = (vcpu->r.cause >> 2) & 0x1F;
  // XXX The above statement treats all Fiasco exception codes (0x100-0x102)
  //     equally as 0. In case of 0x101 (ex_regs triggered exception) this
  //     might be an issue as handle_ipc() might evaluate stale IPC regs.
  //     0x102 is defined but not used.
  assert((vcpu->r.cause & 0x1FF) <= 0x100);
  auto *s = vcpu.state();
  unsigned exccode = (s->guest_ctl_0 >> 2) & 0x1f;

  // Do not trace the very frequent hypcall exits (cause 27 / exccode 2).
  if ((cause != 27 || exccode != 2) && trace().is_active())
    trace().printf("VCPU %d Entry. IP = 0x%lx, cause: 0x%lx(%d), ctl0: 0x%lx\n",
                   vcpu.get_vcpu_id(), vcpu->r.ip, vcpu->r.cause, cause,
                   s->guest_ctl_0);

  switch (cause)
    {
    case 0:
      // IPC wakeup / message from another thread.
      vcpu.handle_ipc(vcpu->i.tag, vcpu->i.label, utcb);
      break;
    case 1: // TLB modify
    case 2: // TLB load/fetch
    case 3: // TLB store
      if (Mips::Instruction(vcpu->r.bad_instr).is_cache_op())
        {
          // FIXME: cache coherency currently not handled
          // We assume that the memory will be coherent when mapped into
          // the guest on first access.
          info().printf("Cache operation on unmapped memory requested. Ignored. (Opcode: 0x%lx, address: 0x%lx)\n",
                        vcpu->r.bad_instr, vcpu->r.pfa);
          vcpu.jump_instruction();
          break;
        }

      // Try MMIO device emulation for the faulting address.
      switch (handle_mmio(vcpu->r.pfa, vcpu))
        {
        case Retry: break;
        case Jump_instr: vcpu.jump_instruction(); break;
        default:
          Err().printf(
            "Bad page fault (%s) 0x%lx (GExcCode=0x%x) @0x%lx. Halting.\n",
            cause == 2 ? "read" : "write", vcpu->r.pfa, exccode, vcpu->r.ip);
          halt_vm(vcpu);
          break;
        }
      break;
    case 27: // guest exception
      {
        Mips::Instruction insn(vcpu->r.bad_instr);
        if (!insn.raw)
          {
            // NOTE(review): execution relies on halt_vm() not returning
            // here; insn would otherwise be used undecoded below.
            Err().printf("Cannot decode faulting instruction @ IP 0x%lx\n",
                         vcpu->r.ip);
            halt_vm(vcpu);
          }

        int ret = -L4_ENOSYS;

        switch (exccode)
          {
          case 0: // sensitive instruction
            if (insn.is_mfc0())
              ret = handle_gpsi_mfc0(vcpu, insn);
            else if (insn.is_mtc0())
              ret = handle_gpsi_mtc0(vcpu, insn);
            else if (insn.is_wait())
              ret = handle_wait(vcpu, utcb);
            else if (insn.is_cache_op())
              {
                // Index Store Tag must only be used to initialise caches, ignore.
                if (insn.cache_optype() != 2)
                  info().printf("Unhandled cache operation 0x%lx. Ignored.\n",
                                vcpu->r.bad_instr);
                // FIXME: assuming that cache coherency is guaranteed by Fiasco
                ret = Jump_instr;
              }
            break;

          case 1: // software field change
            if (insn.is_mtc0())
              ret = handle_software_field_change(vcpu, insn);
            break;

          case 2: // hypcall
            if (insn.is_hypcall())
              ret = dispatch_hypcall((Hypcall_code)(unsigned)insn.hypcall_code(), vcpu);
            break;

          case 9: // hardware field change
            info().printf("Hardware change ignored @ IP 0x%lx\n", vcpu->r.ip);
            ret = 0; // ignored
            break;

          case 10:
            // ret stays -L4_ENOSYS, so the VM is halted below.
            Err().printf("Bad TLB root access 0x%lx @0x%lx. Halting.\n",
                         vcpu->r.pfa, vcpu->r.ip);
            break;
          }

        if (ret < 0)
          {
            Err().printf("Guest exception %d, error: %d, inst: 0x%x @ IP 0x%lx\n",
                         exccode, ret, insn.raw, vcpu->r.ip);
            halt_vm(vcpu);
          }
        if (ret == Jump_instr)
          vcpu.jump_instruction();
        break;
      }
    default:
      Err().printf("Unknown cause of VMM entry: %d. Halting.\n", cause);
      halt_vm(vcpu);
    }

  // Deliver pending IPC and refresh the core IC lines before resuming.
  vcpu.process_pending_ipc(utcb);
  _core_ic->update_vcpu(vcpu);
}
namespace {

using namespace Vdev;

/// Device tree factory for "mti,cpu-interrupt-controller" nodes.
struct F : Factory
{
  cxx::Ref_ptr<Vdev::Device> create(Device_lookup *devs,
                                    Vdev::Dt_node const &) override
  {
    // Device tree only sees the IC for core 0.
    return devs->vmm()->core_ic()->get_ic(0);
  }
};

static F f;
static Vdev::Device_type t = { "mti,cpu-interrupt-controller", nullptr, &f };

}
} // namespace

View File

@@ -0,0 +1,309 @@
/*
* Copyright (C) 2015-2020, 2022-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/cxx/ref_ptr>
#include <l4/vbus/vbus>
#include <l4/l4virtio/l4virtio>
#include "cpc.h"
#include "cm.h"
#include "core_ic.h"
#include "debug.h"
#include "device_tree.h"
#include "generic_guest.h"
#include "cpu_dev_array.h"
#include "irq.h"
#include "vmprint.h"
#include "mips_instructions.h"
#include "vm_ram.h"
/// Sign-extend a 32-bit guest address to the native address width.
constexpr l4_addr_t sign_ext(l4_uint32_t addr)
{
  // Reinterpret as signed 32 bit, widen to a signed machine word, then
  // convert back to an (unsigned) address type.
  return static_cast<l4_addr_t>(
           static_cast<l4_mword_t>(static_cast<l4_int32_t>(addr)));
}
namespace Vmm {
class Guest : public Generic_guest
{
enum Hypcall_code
{
Hypcall_base = 0x160,
Hypcall_outchar = Hypcall_base + 0,
};
struct Cp0_config4
{
l4_uint32_t _v;
Cp0_config4() = default;
Cp0_config4(l4_uint32_t v) : _v(v) {}
CXX_BITFIELD_MEMBER( 0, 7, mmu_sz_ext, _v);
CXX_BITFIELD_MEMBER( 0, 3, ftlb_sets, _v);
CXX_BITFIELD_MEMBER( 4, 7, ftlb_ways, _v);
CXX_BITFIELD_MEMBER( 0, 7, ftlb_info, _v);
CXX_BITFIELD_MEMBER( 8, 12, ftlb_page_size2, _v);
CXX_BITFIELD_MEMBER( 8, 10, ftlb_page_size1, _v);
CXX_BITFIELD_MEMBER(14, 15, mmu_ext_def, _v);
CXX_BITFIELD_MEMBER(16, 23, k_scr_num, _v);
CXX_BITFIELD_MEMBER(24, 27, vtlb_sz_ext, _v);
CXX_BITFIELD_MEMBER(28, 28, ae, _v);
CXX_BITFIELD_MEMBER(29, 30, ie, _v);
static Cp0_config4 *vcpu(Vcpu_ptr vcpu)
{ return reinterpret_cast<Cp0_config4 *>(&vcpu.state()->g_cfg[4]); }
};
struct Cp0_config5
{
l4_uint32_t _v;
Cp0_config5() = default;
Cp0_config5(l4_uint32_t v) : _v(v) {}
CXX_BITFIELD_MEMBER( 0, 0, nf_exists, _v);
CXX_BITFIELD_MEMBER( 2, 2, ufr, _v);
CXX_BITFIELD_MEMBER( 3, 3, mrp, _v);
CXX_BITFIELD_MEMBER( 4, 4, llb, _v);
CXX_BITFIELD_MEMBER( 5, 5, mvh, _v);
CXX_BITFIELD_MEMBER( 6, 6, sbri, _v);
CXX_BITFIELD_MEMBER( 7, 7, vp, _v);
CXX_BITFIELD_MEMBER( 8, 8, fre, _v);
CXX_BITFIELD_MEMBER( 9, 9, ufe, _v);
CXX_BITFIELD_MEMBER(10, 10, l2c, _v);
CXX_BITFIELD_MEMBER(11, 11, dec, _v);
CXX_BITFIELD_MEMBER(13, 13, xnp, _v);
CXX_BITFIELD_MEMBER(27, 27, msa_en, _v);
CXX_BITFIELD_MEMBER(28, 28, eva, _v);
CXX_BITFIELD_MEMBER(29, 29, cv, _v);
CXX_BITFIELD_MEMBER(30, 30, k, _v);
static Cp0_config5 *vcpu(Vcpu_ptr vcpu)
{ return reinterpret_cast<Cp0_config5 *>(&vcpu.state()->g_cfg[5]); }
};
public:
enum
{
Default_rambase = 0,
Boot_offset = sign_ext(0x80000000)
};
Guest();
cxx::Ref_ptr<Gic::Mips_core_ic> core_ic() const { return _core_ic; }
void setup_device_tree(Vdev::Device_tree dt);
l4_addr_t load_binary(Vm_ram *ram, char const *binary,
Ram_free_list *free_list);
void prepare_platform(Vdev::Device_lookup *)
{}
void prepare_binary_run(Vdev::Device_lookup *devs, l4_addr_t entry,
char const *binary, char const *cmd_line,
l4_addr_t dt_boot_addr);
void run(cxx::Ref_ptr<Cpu_dev_array> const &cpus);
int dispatch_hypcall(Hypcall_code hypcall_code, Vcpu_ptr vcpu);
void handle_entry(Vcpu_ptr vcpu);
void show_state_interrupts(FILE *f, Vcpu_ptr vcpu)
{
if (_core_ic)
_core_ic->show_state(f, vcpu);
}
static Guest *create_instance();
private:
int handle_gpsi_mfc0(Vcpu_ptr vcpu, Mips::Instruction insn)
{
l4_umword_t val;
unsigned reg = (insn.rd() << 3) | (insn.func() & 0x7);
trace().printf("MFC0 for 0x%x in register %d\n",
reg, (unsigned) insn.rt());
switch (reg)
{
case L4_VM_CP0_GLOBAL_NUMBER:
val = vcpu.get_vcpu_id() << 8;
break;
case L4_VM_CP0_PROC_ID:
val = vcpu.proc_id();
break;
case L4_VM_CP0_SRS_CTL:
val = 0;
break;
case L4_VM_CP0_CMGCR_BASE:
val = Vdev::Coherency_manager::mem_region().start.get() >> 4;
break;
case L4_VM_CP0_MAAR_0:
case L4_VM_CP0_MAAR_1:
case L4_VM_CP0_ERR_CTL:
case L4_VM_CP0_CONFIG_6:
case L4_VM_CP0_CONFIG_7:
val = 0; break;
default: return -L4_ENOSYS;
}
if (sizeof(l4_addr_t) == 4 || insn.rs() == Mips::Op::Cop0_dmf)
vcpu->r.r[insn.rt()] = val;
else
vcpu->r.r[insn.rt()] = sign_ext((l4_uint32_t) val);
return Jump_instr;
}
int handle_gpsi_mtc0(Vcpu_ptr vcpu, Mips::Instruction insn)
{
unsigned reg = (insn.rd() << 3) | (insn.func() & 0x7);
trace().printf("MTC0 for 0x%x in register %u\n", reg, (unsigned) insn.rt());
switch (reg)
{
case L4_VM_CP0_COUNT:
{
l4_uint32_t newcnt = vcpu->r.r[insn.rt()];
l4_uint32_t kcnt;
asm volatile("rdhwr\t%0, $2" : "=r"(kcnt)); // timer counter
vcpu.state()->guest_timer_offset = (l4_int32_t) (newcnt - kcnt);
vcpu.state()->set_modified(L4_VM_MOD_GTOFFSET);
return Jump_instr;
}
case L4_VM_CP0_CONFIG_0:
case L4_VM_CP0_CONFIG_1:
case L4_VM_CP0_CONFIG_2:
case L4_VM_CP0_CONFIG_3:
case L4_VM_CP0_CONFIG_6:
case L4_VM_CP0_CONFIG_7:
return Jump_instr; // XXX config registers are read-only atm
case L4_VM_CP0_CONFIG_4:
{
// allow setting of ftlb size
auto *cfg4 = Cp0_config4::vcpu(vcpu);
Cp0_config4 newcfg(vcpu->r.r[insn.rt()]);
if (cfg4->ftlb_page_size2() != newcfg.ftlb_page_size2())
{
cfg4->ftlb_page_size2().set(newcfg.ftlb_page_size2());
vcpu.state()->set_modified(L4_VM_MOD_CFG);
}
return Jump_instr;
}
case L4_VM_CP0_CONFIG_5:
{
auto *cfg5 = Cp0_config5::vcpu(vcpu);
Cp0_config5 newcfg(vcpu->r.r[insn.rt()]);
// allow setting of FRE
if (cfg5->fre() != newcfg.fre())
{
cfg5->fre().set(newcfg.fre());
vcpu.state()->set_modified(L4_VM_MOD_CFG);
}
return Jump_instr;
}
case L4_VM_CP0_LOAD_LINKED_ADDR:
if (!(vcpu->r.r[insn.rt()] & 1))
vcpu.state()->set_modified(L4_VM_MOD_LLBIT);
return Jump_instr;
case L4_VM_CP0_MAAR_0: // XXX MAAR and parity are not supported
case L4_VM_CP0_MAAR_1:
case L4_VM_CP0_ERR_CTL:
case L4_VM_CP0_TAG_LO_0: // cache tagging ignored
case L4_VM_CP0_DATA_LO_0:
case L4_VM_CP0_TAG_LO_1:
case L4_VM_CP0_DATA_LO_1:
case L4_VM_CP0_TAG_HI_0:
case L4_VM_CP0_DATA_HI_0:
case L4_VM_CP0_TAG_HI_1:
case L4_VM_CP0_DATA_HI_1:
return Jump_instr;
}
return -L4_EINVAL;
}
/**
 * Emulate an MTC0 to a CP0 register that is maintained purely in software.
 *
 * Only the guest Status and Cause registers are handled here; for Cause
 * only the software-writable bits are taken over from the guest value.
 *
 * \retval Jump_instr  Field change applied, skip the instruction.
 * \retval -L4_EINVAL  Register is not software-emulated.
 */
int handle_software_field_change(Vcpu_ptr vcpu, Mips::Instruction insn)
{
  l4_umword_t new_val = vcpu->r.r[insn.rt()];
  // CP0 register index packed from (rd, sel).
  unsigned cp0_reg = (insn.rd() << 3) | (insn.func() & 0x7);
  auto *state = vcpu.state();

  trace().printf("MTC0(soft) for 0x%x in register %d (0x%lx) \n",
                 cp0_reg, (unsigned) insn.rt(), new_val);

  if (cp0_reg == L4_VM_CP0_STATUS)
    {
      state->g_status = new_val;
      state->set_modified(L4_VM_MOD_STATUS);
      return Jump_instr;
    }

  if (cp0_reg == L4_VM_CP0_CAUSE)
    {
      // Mask of the guest-writable Cause bits; all others stay untouched.
      l4_umword_t const writable_bits = 0x8c00ff00UL;
      state->get_state(L4_VM_MOD_CAUSE);
      state->g_cause = (state->g_cause & ~writable_bits)
                       | (new_val & writable_bits);
      state->set_modified(L4_VM_MOD_CAUSE);
      return Jump_instr;
    }

  return -L4_EINVAL;
}
/**
 * Emulate the guest WAIT instruction by blocking until the next event.
 *
 * Computes the time of the next guest timer interrupt from the guest's
 * Compare register and sleeps in IPC until then (or until any IPC/IRQ
 * arrives earlier).
 *
 * \param vcpu  The halting vCPU.
 * \param utcb  UTCB to use for the blocking IPC.
 *
 * \retval Jump_instr  Always; execution continues after the WAIT.
 */
int handle_wait(Vcpu_ptr vcpu, l4_utcb_t *utcb)
{
// Do not block at all if an interrupt is already pending.
if (Gic::Mips_core_ic::has_pending(vcpu))
return Jump_instr;
auto *s = vcpu.state();
auto *kip = l4re_kip();
l4_cpu_time_t kip_time;
// get kip time and hardware in sync
do
{
kip_time = l4_kip_clock(kip);
s->update_state(L4_VM_MOD_CAUSE | L4_VM_MOD_COMPARE);
// Cause bit 30 (TI) set means the timer already fired -- no need to wait.
if (s->g_cause & (1UL << 30))
return Jump_instr; // there was a timer interrupt
l4_mb();
}
while (kip_time != l4_kip_clock(kip));
// Current guest Count value: host counter snapshot plus guest offset.
l4_uint32_t gcnt = s->saved_cause_timestamp
+ (l4_int32_t) s->guest_timer_offset;
l4_uint32_t diff;
l4_uint32_t cmp = s->g_compare;
// Distance in counter ticks until Count reaches Compare (with wrap-around).
if (gcnt < cmp)
diff = cmp - gcnt;
else
diff = (0xffffffff - gcnt) + cmp;
// The MIPS Count register ticks at half the CPU clock; frequency_cpu is
// presumably in kHz, so this converts ticks to microseconds (KIP clock
// granularity) -- TODO confirm units against the KIP definition.
auto freq = kip->frequency_cpu / 2;
diff = ((diff + freq - 1) / freq) * 1000;
// make sure the timer interrupt has passed on the Fiasco clock tick
diff += kip->scheduler_granularity;
l4_timeout_t to;
l4_rcv_timeout(l4_timeout_abs_u(kip_time + diff, 8, utcb), &to);
vcpu.wait_for_ipc(utcb, to);
return Jump_instr;
}
/// Buffer collecting guest console output issued via hypcall.
Guest_print_buffer _hypcall_print;
/// The MIPS core interrupt controller of the guest.
cxx::Ref_ptr<Gic::Mips_core_ic> _core_ic;
/// Coherency manager device model.
cxx::Ref_ptr<Vdev::Coherency_manager> _cm;
/// Cluster power controller device model.
cxx::Ref_ptr<Vdev::Mips_cpc> _cpc;
};
} // namespace

View File

@@ -0,0 +1,167 @@
/*
* Copyright (C) 2016-2017, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <l4/cxx/static_container>
#include "guest.h"
#include "guest_entry.h"
#include "vcpu_ptr.h"
/// The singleton instance of the VMM.
static cxx::Static_container<Vmm::Guest> guest;
/**
 * Save the hardware FPU register file and control word into \a s.
 *
 * With 64-bit FPU registers (__mips_fpr == 64) all 32 registers are
 * stored individually. In 32-bit FPR mode, odd and even registers are
 * paired, so storing each even register with sdc1 saves 64 bits covering
 * the pair; only 16 stores are needed.
 */
static void
save_fpu(Vmm::Fpu_state *s)
{
asm volatile(".set push\n");
// Ensure the assembler accepts FPU instructions regardless of build flags.
asm volatile(".set hardfloat\n");
#if __mips_fpr == 64
asm volatile("sdc1 $f0, %0" : : "m"(s->regs[0]));
asm volatile("sdc1 $f1, %0" : : "m"(s->regs[1]));
asm volatile("sdc1 $f2, %0" : : "m"(s->regs[2]));
asm volatile("sdc1 $f3, %0" : : "m"(s->regs[3]));
asm volatile("sdc1 $f4, %0" : : "m"(s->regs[4]));
asm volatile("sdc1 $f5, %0" : : "m"(s->regs[5]));
asm volatile("sdc1 $f6, %0" : : "m"(s->regs[6]));
asm volatile("sdc1 $f7, %0" : : "m"(s->regs[7]));
asm volatile("sdc1 $f8, %0" : : "m"(s->regs[8]));
asm volatile("sdc1 $f9, %0" : : "m"(s->regs[9]));
asm volatile("sdc1 $f10, %0" : : "m"(s->regs[10]));
asm volatile("sdc1 $f11, %0" : : "m"(s->regs[11]));
asm volatile("sdc1 $f12, %0" : : "m"(s->regs[12]));
asm volatile("sdc1 $f13, %0" : : "m"(s->regs[13]));
asm volatile("sdc1 $f14, %0" : : "m"(s->regs[14]));
asm volatile("sdc1 $f15, %0" : : "m"(s->regs[15]));
asm volatile("sdc1 $f16, %0" : : "m"(s->regs[16]));
asm volatile("sdc1 $f17, %0" : : "m"(s->regs[17]));
asm volatile("sdc1 $f18, %0" : : "m"(s->regs[18]));
asm volatile("sdc1 $f19, %0" : : "m"(s->regs[19]));
asm volatile("sdc1 $f20, %0" : : "m"(s->regs[20]));
asm volatile("sdc1 $f21, %0" : : "m"(s->regs[21]));
asm volatile("sdc1 $f22, %0" : : "m"(s->regs[22]));
asm volatile("sdc1 $f23, %0" : : "m"(s->regs[23]));
asm volatile("sdc1 $f24, %0" : : "m"(s->regs[24]));
asm volatile("sdc1 $f25, %0" : : "m"(s->regs[25]));
asm volatile("sdc1 $f26, %0" : : "m"(s->regs[26]));
asm volatile("sdc1 $f27, %0" : : "m"(s->regs[27]));
asm volatile("sdc1 $f28, %0" : : "m"(s->regs[28]));
asm volatile("sdc1 $f29, %0" : : "m"(s->regs[29]));
asm volatile("sdc1 $f30, %0" : : "m"(s->regs[30]));
asm volatile("sdc1 $f31, %0" : : "m"(s->regs[31]));
#else
// 32-bit FPR mode: each sdc1 on an even register saves the 64-bit pair.
asm volatile("sdc1 $f0, %0" : : "m"(s->regs[0]));
asm volatile("sdc1 $f2, %0" : : "m"(s->regs[1]));
asm volatile("sdc1 $f4, %0" : : "m"(s->regs[2]));
asm volatile("sdc1 $f6, %0" : : "m"(s->regs[3]));
asm volatile("sdc1 $f8, %0" : : "m"(s->regs[4]));
asm volatile("sdc1 $f10, %0" : : "m"(s->regs[5]));
asm volatile("sdc1 $f12, %0" : : "m"(s->regs[6]));
asm volatile("sdc1 $f14, %0" : : "m"(s->regs[7]));
asm volatile("sdc1 $f16, %0" : : "m"(s->regs[8]));
asm volatile("sdc1 $f18, %0" : : "m"(s->regs[9]));
asm volatile("sdc1 $f20, %0" : : "m"(s->regs[10]));
asm volatile("sdc1 $f22, %0" : : "m"(s->regs[11]));
asm volatile("sdc1 $f24, %0" : : "m"(s->regs[12]));
asm volatile("sdc1 $f26, %0" : : "m"(s->regs[13]));
asm volatile("sdc1 $f28, %0" : : "m"(s->regs[14]));
asm volatile("sdc1 $f30, %0" : : "m"(s->regs[15]));
#endif
// Save the FPU control/status register (FCSR).
asm volatile("cfc1 %0, $31" : "=r"(s->status));
asm volatile(".set pop\n");
}
/**
 * Restore the hardware FPU register file and control word from \a s.
 *
 * Mirror image of save_fpu(): with 64-bit FPU registers all 32 registers
 * are loaded individually, in 32-bit FPR mode each ldc1 on an even
 * register restores the 64-bit register pair.
 */
static void
restore_fpu(Vmm::Fpu_state const *s)
{
asm volatile(".set push\n");
// Ensure the assembler accepts FPU instructions regardless of build flags.
asm volatile(".set hardfloat\n");
#if __mips_fpr == 64
asm volatile("ldc1 $f0, %0" : : "m"(s->regs[0]));
asm volatile("ldc1 $f1, %0" : : "m"(s->regs[1]));
asm volatile("ldc1 $f2, %0" : : "m"(s->regs[2]));
asm volatile("ldc1 $f3, %0" : : "m"(s->regs[3]));
asm volatile("ldc1 $f4, %0" : : "m"(s->regs[4]));
asm volatile("ldc1 $f5, %0" : : "m"(s->regs[5]));
asm volatile("ldc1 $f6, %0" : : "m"(s->regs[6]));
asm volatile("ldc1 $f7, %0" : : "m"(s->regs[7]));
asm volatile("ldc1 $f8, %0" : : "m"(s->regs[8]));
asm volatile("ldc1 $f9, %0" : : "m"(s->regs[9]));
asm volatile("ldc1 $f10, %0" : : "m"(s->regs[10]));
asm volatile("ldc1 $f11, %0" : : "m"(s->regs[11]));
asm volatile("ldc1 $f12, %0" : : "m"(s->regs[12]));
asm volatile("ldc1 $f13, %0" : : "m"(s->regs[13]));
asm volatile("ldc1 $f14, %0" : : "m"(s->regs[14]));
asm volatile("ldc1 $f15, %0" : : "m"(s->regs[15]));
asm volatile("ldc1 $f16, %0" : : "m"(s->regs[16]));
asm volatile("ldc1 $f17, %0" : : "m"(s->regs[17]));
asm volatile("ldc1 $f18, %0" : : "m"(s->regs[18]));
asm volatile("ldc1 $f19, %0" : : "m"(s->regs[19]));
asm volatile("ldc1 $f20, %0" : : "m"(s->regs[20]));
asm volatile("ldc1 $f21, %0" : : "m"(s->regs[21]));
asm volatile("ldc1 $f22, %0" : : "m"(s->regs[22]));
asm volatile("ldc1 $f23, %0" : : "m"(s->regs[23]));
asm volatile("ldc1 $f24, %0" : : "m"(s->regs[24]));
asm volatile("ldc1 $f25, %0" : : "m"(s->regs[25]));
asm volatile("ldc1 $f26, %0" : : "m"(s->regs[26]));
asm volatile("ldc1 $f27, %0" : : "m"(s->regs[27]));
asm volatile("ldc1 $f28, %0" : : "m"(s->regs[28]));
asm volatile("ldc1 $f29, %0" : : "m"(s->regs[29]));
asm volatile("ldc1 $f30, %0" : : "m"(s->regs[30]));
asm volatile("ldc1 $f31, %0" : : "m"(s->regs[31]));
#else
// 32-bit FPR mode: each ldc1 on an even register restores the pair.
asm volatile("ldc1 $f0, %0" : : "m"(s->regs[0]));
asm volatile("ldc1 $f2, %0" : : "m"(s->regs[1]));
asm volatile("ldc1 $f4, %0" : : "m"(s->regs[2]));
asm volatile("ldc1 $f6, %0" : : "m"(s->regs[3]));
asm volatile("ldc1 $f8, %0" : : "m"(s->regs[4]));
asm volatile("ldc1 $f10, %0" : : "m"(s->regs[5]));
asm volatile("ldc1 $f12, %0" : : "m"(s->regs[6]));
asm volatile("ldc1 $f14, %0" : : "m"(s->regs[7]));
asm volatile("ldc1 $f16, %0" : : "m"(s->regs[8]));
asm volatile("ldc1 $f18, %0" : : "m"(s->regs[9]));
asm volatile("ldc1 $f20, %0" : : "m"(s->regs[10]));
asm volatile("ldc1 $f22, %0" : : "m"(s->regs[11]));
asm volatile("ldc1 $f24, %0" : : "m"(s->regs[12]));
asm volatile("ldc1 $f26, %0" : : "m"(s->regs[13]));
asm volatile("ldc1 $f28, %0" : : "m"(s->regs[14]));
asm volatile("ldc1 $f30, %0" : : "m"(s->regs[15]));
#endif
// Restore the FPU control/status register (FCSR).
asm volatile("ctc1 %0, $31" : : "r"(s->status));
asm volatile(".set pop\n");
}
/**
 * C entry point for guest exits, invoked via the vCPU entry vector.
 *
 * Saves the guest FPU context, dispatches the exit to the VMM and resumes
 * the guest afterwards. Does not return on success; any failure halts
 * the VM.
 */
void
c_vcpu_entry(l4_vcpu_state_t *vcpu)
{
Vmm::Vcpu_ptr c(vcpu);
// NOTE(review): presumably checks that the exit did not originate from
// the VMM itself (Status bit 3) -- confirm against the Status layout.
if (!(vcpu->r.status & (1UL << 3)))
{
Err().printf("Exception in entry handler. Halting. IP = 0x%lx\n",
vcpu->r.ip);
guest->halt_vm(c);
}
// The VMM may use the FPU itself, so save/restore the guest FPU context
// around exit handling.
save_fpu(c.fpu_state());
guest->handle_entry(c);
restore_fpu(c.fpu_state());
// NOTE(review): default-constructed (invalid) cap -- relies on the kernel
// treating it as the current thread for vcpu_resume; confirm.
L4::Cap<L4::Thread> myself;
auto e = l4_error(myself->vcpu_resume_commit(myself->vcpu_resume_start()));
// vcpu_resume only returns on error.
Err().printf("VM restart failed with %ld\n", e);
guest->halt_vm(c);
}
/**
 * Construct and return the singleton Guest instance.
 *
 * Must be called exactly once before any other use of the guest object.
 */
Vmm::Guest *
Vmm::Guest::create_instance()
{
guest.construct();
return guest;
}

View File

@@ -0,0 +1,14 @@
/*
* Copyright (C) 2016-2017, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/vcpu.h>
/// Entry point for guest exits.
void c_vcpu_entry(l4_vcpu_state_t *vcpu);
/// Entry point for newly created vcpu threads.
void *powerup_handler(void *vcpu);

View File

@@ -0,0 +1,160 @@
/*
* Copyright (C) 2015-2017, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <l4/sys/types.h>
#include <l4/cxx/bitfield>
namespace Mips {
/// Opcode and function-field encodings of the MIPS instructions the VMM
/// needs to decode.
namespace Op {
/// Primary opcodes (instruction bits 31..26).
enum Opcode
{
Special = 0, Regimm, J, Jal, Beq, Bne, Blez, Bgtz,
Pop10 = 8, Addiu, Slti, Sltiu, Andi, Ori, Xori, Lui,
Cop0 = 16, Cop1, Cop2, Cop1x, Beql, Bnel, Blezl, Bgtzl,
Pop30 = 24, Special2 = 28, Jalx, Msa, Special3,
Lb = 32, Lh, Lwlw, Lw, Lbu, Lhu, Lwr,
Sb = 40, Sh, Swl, Sw, Swr = 46, Cache,
Ll = 48, Lwc1, Lwc2, Pref, Ldc1 = 53, Pop66, Ld,
Sc = 56, Swc1, Swc2, Pcrel, Sdc1 = 61, Pop76, Sd
};
/// rs-field values for the Cop0 opcode (move to/from CP0, hypcall).
enum Cop0_rs
{
Cop0_mfc0 = 0, Cop0_dmf = 1, Cop0_mfh = 2, Cop0_mtc0 = 4, Cop0_dmt,
Cop0_mth = 6, Cop0_hypcall = 0x28,
};
/// Function-field values for the Special opcode (register jumps).
enum Special
{
Sp_jr = 8, Sp_jalr = 9
};
/// Function-field values for the Special3 opcode (cache ops on MIPS64).
enum Special3
{
Sp3_cachee = 0x1b,
Sp3_cache = 0x25
};
/// rt-field values for the Regimm opcode (conditional branches/traps).
enum Regimm
{
Bltz = 0, Bgez, Bltzl, Bgezl,
Tgei = 8, Tgeiu, Tlti, Tltiu, Teqi,
Nal =16, Bal, Bltzall, Bgezall
};
}
/**
 * Decoder for a single 32-bit MIPS instruction word.
 *
 * Provides bitfield accessors for the common instruction formats plus
 * predicates for the instruction classes the VMM has to emulate.
 */
struct Instruction
{
  l4_uint32_t raw;
  // generic fields
  CXX_BITFIELD_MEMBER_RO(26, 31, opcode, raw);
  CXX_BITFIELD_MEMBER_RO(21, 25, rs, raw);
  CXX_BITFIELD_MEMBER_RO(16, 20, rt, raw);
  CXX_BITFIELD_MEMBER_RO( 0, 15, imm, raw);
  CXX_BITFIELD_MEMBER_RO( 0, 5, func, raw);
  CXX_BITFIELD_MEMBER_RO( 6, 10, sa, raw);
  CXX_BITFIELD_MEMBER_RO(11, 15, rd, raw);
  // HYPCALL fields
  CXX_BITFIELD_MEMBER_RO(11, 20, hypcall_code, raw);
  // opcode for load/store instructions
  // Note that not all combinations are valid.
  CXX_BITFIELD_MEMBER_RO(31, 31, opcode_mem, raw);
  CXX_BITFIELD_MEMBER_RO(30, 30, op_mem_atomic, raw);
  CXX_BITFIELD_MEMBER_RO(29, 29, op_mem_store, raw);
  CXX_BITFIELD_MEMBER_RO(28, 28, op_mem_unsigned, raw);
  CXX_BITFIELD_MEMBER_RO(26, 27, op_mem_width, raw);
  // for Cop0
  CXX_BITFIELD_MEMBER_RO(25, 25, cop0_co, raw);
  // for J/JAL
  CXX_BITFIELD_MEMBER_RO( 0, 25, instr_index, raw);
  // for FP ops
  CXX_BITFIELD_MEMBER_RO(28, 28, op_fp_dc1, raw);
  // for cache ops
  CXX_BITFIELD_MEMBER_RO(18, 20, cache_optype, raw);

  Instruction(l4_uint32_t inst) : raw(inst) {}

  /// True if this is a (32- or 64-bit) move from CP0.
  bool is_mfc0() const
  {
    return opcode() == Op::Cop0
           && (rs() == Op::Cop0_mfc0 || rs() == Op::Cop0_dmf);
  }

  /// True if this is a (32- or 64-bit) move to CP0.
  bool is_mtc0() const
  {
    return opcode() == Op::Cop0
           && (rs() == Op::Cop0_mtc0 || rs() == Op::Cop0_dmt);
  }

  /// True if this is a HYPCALL instruction.
  bool is_hypcall() const
  { return opcode() == Op::Cop0 && func() == Op::Cop0_hypcall; }

  /// True if this is a WAIT instruction.
  bool is_wait() const
  { return opcode() == Op::Cop0 && cop0_co() && func() == 0x20 ; }

  /// True if this is a CACHE/CACHEE operation (Special3 encoding on MIPS64).
  bool is_cache_op() const
  {
    return opcode() == Op::Cache
           || (sizeof(l4_umword_t) == 8 && opcode() == Op::Special3
               && (func() == Op::Sp3_cache || func() == Op::Sp3_cachee));
  }

  /**
   * True for plain integer loads/stores that can be emulated directly.
   *
   * Excludes atomics (LL/SC), the unaligned LWL/LWR/SWL/SWR forms and
   * invalid unsigned stores. The final term matches LD (0x37) and
   * SD (0x3f), which share all mask bits except bit 29.
   */
  bool is_simple_load_store() const
  {
    return (opcode_mem() && !op_mem_atomic()
            && op_mem_width() != 2
            && !(op_mem_unsigned() && op_mem_store()))
           || ((opcode() & 0x37) == 0x37);
  }

  /// True for FP loads/stores (LWC1/SWC1/LDC1/SDC1).
  bool is_fp_load_store() const
  {
    // Fixed: SWC1 was previously not tested (Op::Sdc1 appeared twice),
    // so 32-bit FP stores were not recognized.
    return opcode() == Op::Lwc1 || opcode() == Op::Swc1
           || opcode() == Op::Ldc1 || opcode() == Op::Sdc1;
  }

  /**
   * Return width of a load/store operation.
   *
   * \pre The instruction is a load/store operation.
   *
   * \retval 0 Byte width (8bit).
   * \retval 1 Half-word width (16bit).
   * \retval 2 Word width (32bit).
   * \retval 3 Double-word width (64bit).
   */
  char load_store_width() const
  {
    switch (opcode())
      {
      case Op::Lb:
      case Op::Lbu:
      case Op::Sb:
        return 0;
      case Op::Lh:
      case Op::Lhu:
      case Op::Sh:
        return 1;
      case Op::Ld:
      case Op::Sd:
      case Op::Ldc1:
      case Op::Sdc1:
        return 3;
      default:
        return 2;
      }
  }

  /// Sign-extended, word-aligned branch offset of a conditional branch.
  int branch_offset() const
  { return ((int) ((l4_int16_t) imm())) << 2; }
};
} // namespace

View File

@@ -0,0 +1,123 @@
/*
* Copyright (C) 2019, 2023-2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
* Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cstdio>
#include <cstring>
#include "vcpu_ptr.h"
#include "monitor/monitor.h"
#include "monitor/monitor_args.h"
namespace Monitor {
/// No-op variant used when the monitor interface is disabled.
template<bool, typename T>
class Cpu_dev_cmd_handler {};

/**
 * Monitor command handler dumping the MIPS CPU and guest CP0 state.
 *
 * \tparam T  CRTP type providing `vcpu()`, returning the Vmm::Vcpu_ptr
 *            of the CPU this handler is attached to.
 */
template<typename T>
class Cpu_dev_cmd_handler<true, T> : public Cmd
{
public:
  char const *help() const override
  { return "CPU state"; }

  void usage(FILE *f) const override
  {
    fprintf(f, "%s\n"
               "* 'cpu <i> regs': dump CPU registers\n",
            help());
  }

  void complete(FILE *f, Completion_request *compl_req) const override
  { compl_req->complete(f, "regs"); }

  void exec(FILE *f, Arglist *args) override
  {
    if (*args == "regs")
      show_regs(f);
    else
      argument_error("Invalid subcommand");
  }

  /// Print general-purpose registers (o32 naming) and the guest CP0 state.
  void show_regs(FILE *f) const
  {
    auto v = get_vcpu();
    fprintf(f, "EPC=%08lx SP=%08lx\n",
            v->r.ip, v->r.sp);
    fprintf(f, "Status=%08lx  Cause=%08lx\n",
            v->r.status, v->r.cause);
    fprintf(f, "ULR=%08lx  Hi=%08lx Lo=%08lx\n",
            v->r.ulr, v->r.hi, v->r.lo);
    fprintf(f, "at/ 1=%08lx v0/ 2=%08lx v1/ 3=%08lx\n",
            v->r.r[1], v->r.r[2], v->r.r[3]);
    // Fixed register labels: $6 is a2 and $7 is a3 in the o32 convention
    // used throughout this dump (previously mislabeled a1/a4).
    fprintf(f, "a0/ 4=%08lx a1/ 5=%08lx a2/ 6=%08lx a3/ 7=%08lx\n",
            v->r.r[4], v->r.r[5], v->r.r[6], v->r.r[7]);
    fprintf(f, "t0/ 8=%08lx t1/ 9=%08lx t2/10=%08lx t3/11=%08lx\n",
            v->r.r[8], v->r.r[9], v->r.r[10], v->r.r[11]);
    fprintf(f, "t4/12=%08lx t5/13=%08lx t6/14=%08lx t7/15=%08lx\n",
            v->r.r[12], v->r.r[13], v->r.r[14], v->r.r[15]);
    fprintf(f, "s0/16=%08lx s1/17=%08lx s2/18=%08lx s3/19=%08lx\n",
            v->r.r[16], v->r.r[17], v->r.r[18], v->r.r[19]);
    fprintf(f, "s4/20=%08lx s5/21=%08lx s6/22=%08lx s7/23=%08lx\n",
            v->r.r[20], v->r.r[21], v->r.r[22], v->r.r[23]);
    fprintf(f, "t8/24=%08lx t9/25=%08lx k0/26=%08lx k1/27=%08lx\n",
            v->r.r[24], v->r.r[25], v->r.r[26], v->r.r[27]);
    fprintf(f, "gp/28=%08lx sp/29=%08lx s8/30=%08lx ra/31=%08lx\n",
            v->r.r[28], v->r.r[29], v->r.r[30], v->r.r[31]);

    // Fetch the complete guest CP0 state from the kernel before dumping.
    auto *s = v.state();
    s->update_state(~0UL);
    fprintf(f, "\nGuestCtl0= %08lx  Guestctl0_ext= %08lx\n",
            s->guest_ctl_0, s->guest_ctl_0_ext);
    fprintf(f, "GuestCtl1= %08lx  Guestctl2    = %08lx\n",
            s->guest_ctl_1, s->guest_ctl_2);
    fprintf(f, "\nGuest CP0:\n");

    fprintf(f, "Status   = %08lx  Cause    = %08lx\n",
            s->g_status, s->g_cause);
    fprintf(f, "Index    = %08lx  EBase    = %08lx\n",
            s->g_index, s->g_ebase);
    fprintf(f, "EntryLo0 = %08lx  EntryLo1 = %08lx\n",
            s->g_entry_lo[0], s->g_entry_lo[1]);
    fprintf(f, "Context  = %08lx  EntryHi  = %08lx\n",
            s->g_context, s->g_entry_hi);
    fprintf(f, "PageMask = %08lx  PageGrain= %08lx\n",
            s->g_page_mask, s->g_page_grain);
    fprintf(f, "ULR      = %08lx  Wired    = %08lx\n",
            s->g_ulr, s->g_wired);
    fprintf(f, "SegCtl0  = %08lx  SegCtl1  = %08lx\n",
            s->g_seg_ctl[0], s->g_seg_ctl[1]);
    fprintf(f, "SegCtl2  = %08lx  HWRena   = %08lx\n",
            s->g_seg_ctl[2], s->g_hwrena);
    fprintf(f, "PWBase   = %08lx  PWField  = %08lx\n",
            s->g_pw_base, s->g_pw_field);
    fprintf(f, "PWSize   = %08lx  PWCtl    = %08lx\n",
            s->g_pw_size, s->g_pw_ctl);
    fprintf(f, "BadVAddr = %08lx  BadInstr = %08lx\n",
            s->g_bad_v_addr, s->g_bad_instr);
    fprintf(f, "BadInstrP= %08lx  Compare  = %08lx\n",
            s->g_bad_instr_p, s->g_compare);
    fprintf(f, "IntCtl   = %08lx  EPC      = %08lx\n",
            s->g_intctl, s->g_epc);
    fprintf(f, "Config0  = %08lx  Config1  = %08lx\n",
            s->g_cfg[0], s->g_cfg[1]);
    fprintf(f, "Config2  = %08lx  Config3  = %08lx\n",
            s->g_cfg[2], s->g_cfg[3]);
    fprintf(f, "Config4  = %08lx  Config5  = %08lx\n",
            s->g_cfg[4], s->g_cfg[5]);
  }

private:
  /// CRTP accessor for the vCPU this handler belongs to.
  Vmm::Vcpu_ptr get_vcpu() const
  { return static_cast<T const *>(this)->vcpu(); }
};
}

View File

@@ -0,0 +1,8 @@
/*
* Copyright (C) 2016-2017, 2019, 2024 Kernkonzept GmbH.
* Author(s): Timo Nicolai <timo.nicolai@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
/**
 * Report whether this architecture provides a guest I/O port map.
 *
 * MIPS has no x86-style I/O port space, so this is always false.
 *
 * \return false on this architecture.
 */
[[nodiscard]] constexpr bool has_iomap()
{ return false; }

View File

@@ -0,0 +1,273 @@
/*
* Copyright (C) 2015-2017, 2020, 2024 Kernkonzept GmbH.
* Author(s): Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <cassert>
#include <cstring>
#include <l4/re/error_helper>
#include <l4/sys/kdebug.h>
#include <l4/sys/thread_mips.h>
#include <l4/sys/vm.h>
#include "generic_vcpu_ptr.h"
#include "mem_access.h"
#include "mips_instructions.h"
namespace Vmm {
/**
 * Container for the guest FPU register file and control word.
 *
 * With 64-bit FPU registers (__mips_fpr == 64) all 32 registers are
 * stored individually; otherwise the 32 architectural registers are
 * stored pairwise in 16 64-bit slots (see save_fpu()/restore_fpu()).
 */
struct Fpu_state
{
#if __mips_fpr == 64
// Read FPU register \a fpnr.
l4_uint64_t read(unsigned fpnr)
{ return regs[fpnr]; }
// Write FPU register \a fpnr; the width argument is unused here.
void write(unsigned fpnr, char, l4_uint64_t value)
{ regs[fpnr] = value; }
l4_uint64_t regs[32];
#else
// Read FPU register \a fpnr from the pairwise-packed save area.
l4_uint64_t read(unsigned fpnr)
{
// registers are numbered by 32bit but saved in 64bit
// so for odd FPU register numbers return the upper 32bits.
return regs[fpnr >> 1] >> (32 * (fpnr & 1));
}
// Write FPU register \a fpnr. \a size is the Mem_access width code:
// 3 replaces the whole 64-bit pair, anything else updates one half.
void write(unsigned fpnr, char size, l4_uint64_t value)
{
if (size == 3)
regs[fpnr >> 1] = value;
else
{
// write the 32bit value in the upper or lower part of the
// saved 64bit value
value &= 0xffffffff;
// Mask for the 64bit register: upper 32 bit for even FPU registers,
// lower 32 bit for odd FPU registers.
l4_uint64_t regmask = (0xffffffffULL << (32 * (~fpnr & 1)));
regs[fpnr >> 1] = (regmask & regs[fpnr >> 1])
| (value << (32 * (fpnr & 1)));
}
}
l4_uint64_t regs[16];
#endif
// Saved FPU control/status register (FCSR).
l4_umword_t status;
};
/**
 * Extended vCPU VM state with helpers for lazy CP0 synchronization.
 *
 * The kernel tracks which guest CP0 registers are clean via clean_cp0_map
 * and which were modified by the VMM via modified_cp0_map.
 */
struct State : l4_vm_state_t
{
// Mark the given CP0 register set as modified so the kernel reloads it.
void set_modified(l4_umword_t bits)
{ modified_cp0_map |= bits; }

// Fetch the given CP0 register set from the kernel unless already clean.
void get_state(l4_umword_t bits)
{
if ((clean_cp0_map & bits) != bits)
l4_thread_mips_save_vm_state(L4_INVALID_CAP, bits);
}

// Unconditionally fetch the given CP0 register set from the kernel.
void update_state(l4_umword_t bits)
{ l4_thread_mips_save_vm_state(L4_INVALID_CAP, bits); }
};
/**
 * MIPS-specific wrapper around the extended vCPU state pointer.
 *
 * Adds MMIO decoding, FPU state management and guest PC advancement
 * (including branch-delay-slot emulation) on top of Generic_vcpu_ptr.
 */
class Vcpu_ptr : public Generic_vcpu_ptr
{
public:
  explicit Vcpu_ptr(l4_vcpu_state_t *s) : Generic_vcpu_ptr(s) {}

  /// True if the current guest fault was caused by a write access.
  bool pf_write() const
  { return _s->r.cause & 4; }

  /// Attach the current thread as vCPU handler thread.
  void thread_attach()
  {
    control_ext(L4::Cap<L4::Thread>());
  }

  /**
   * Advance the guest IP over the trapping instruction.
   *
   * If the instruction was in a branch delay slot (Cause.BD set), the
   * preceding branch is emulated to compute the correct continuation
   * address.
   */
  void jump_instruction()
  {
    auto *r = &_s->r;
    // Common case: not in a branch delay slot, just skip the instruction.
    if (!(r->cause & (1 << 31)))
      {
        r->ip += 4;
        return;
      }
    // emulate the branch instruction
    Mips::Instruction insn(r->bad_instr_p);
    switch (insn.opcode())
      {
      case Mips::Op::Special:
        switch (insn.func())
          {
          case Mips::Op::Sp_jr:
            r->ip = r->r[insn.rs()];
            return;
          case Mips::Op::Sp_jalr:
            auto ra = r->ip + 8;
            r->ip = r->r[insn.rs()];
            r->r[insn.rd()] = ra;
            return;
          }
        break;
      case Mips::Op::Regimm:
        switch (insn.rt())
          {
          case Mips::Op::Bal:
          case Mips::Op::Bgezall:
            r->r[31] = r->ip + 8;
            // branch-and-link: fall through to the branch evaluation
            [[fallthrough]];
          case Mips::Op::Bgez:
          case Mips::Op::Bgezl:
            if ((long) r->r[insn.rs()] >= 0)
              r->ip += insn.branch_offset() + 4;
            else
              r->ip += 8;
            return;
          case Mips::Op::Nal:
          case Mips::Op::Bltzall:
            r->r[31] = r->ip + 8;
            // branch-and-link: fall through to the branch evaluation
            [[fallthrough]];
          case Mips::Op::Bltz:
          case Mips::Op::Bltzl:
            if ((long) r->r[insn.rs()] < 0)
              r->ip += insn.branch_offset() + 4;
            else
              r->ip += 8;
            return;
          }
        break;
      case Mips::Op::Beql:
      case Mips::Op::Bnel:
      case Mips::Op::Bgtzl:
      case Mips::Op::Blezl:
        if (insn.rt() == 0)
          r->ip += insn.branch_offset() + 4;
        else
          r->ip += 8; // R6 compact branch instruction
        return;
      case Mips::Op::Beq:
        if (r->r[insn.rs()] == r->r[insn.rt()])
          r->ip += insn.branch_offset() + 4;
        else
          r->ip += 8;
        return;
      case Mips::Op::Bne:
        if (r->r[insn.rs()] != r->r[insn.rt()])
          r->ip += insn.branch_offset() + 4;
        else
          r->ip += 8;
        return;
      case Mips::Op::Bgtz:
        if (insn.rt() == 0 && (long) r->r[insn.rs()] > 0)
          r->ip += insn.branch_offset() + 4;
        else
          r->ip += 8;
        return;
      case Mips::Op::Blez:
        if (insn.rt() == 0 && (long) r->r[insn.rs()] <= 0)
          r->ip += insn.branch_offset() + 4;
        else
          r->ip += 8;
        return;
      case Mips::Op::Jal:
        r->ra = r->ip + 8;
        [[fallthrough]];
      case Mips::Op::J:
        r->ip = (r->ip & ~((1UL << 28) - 1)) | (insn.instr_index() << 2);
        return;
      // compact branch instructions on R6
      case Mips::Op::Pop10:
      case Mips::Op::Pop30:
      case Mips::Op::Pop66:
      case Mips::Op::Pop76:
        r->ip += 8;
        return;
      }
    Err().printf("Guest exception in branch delay slot. Instruction not implemented @ IP 0x%lx\n", _s->r.ip);
    enter_kdebug("STOP");
  }

  /**
   * Decode the faulting instruction into a generic memory access.
   *
   * \return Mem_access describing the access; `access` is set to
   *         Mem_access::Other for instructions that cannot be handled.
   */
  Mem_access decode_mmio() const
  {
    Mips::Instruction insn(_s->r.bad_instr);
    Mem_access m;
    m.access = insn.op_mem_store() ? Mem_access::Store : Mem_access::Load;

    if (insn.is_simple_load_store())
      {
        m.width = insn.load_store_width();
        if (m.access == Mem_access::Store)
          m.value = _s->r.r[insn.rt()];
      }
    else if (insn.is_fp_load_store())
      {
        m.width = insn.op_fp_dc1() ? Mem_access::Wd64 : Mem_access::Wd32;
        if (m.access == Mem_access::Store)
          m.value = fpu_state()->read(insn.rt());
      }
    else
      m.access = Mem_access::Other;

    return m;
  }

  /**
   * Write the result of an emulated MMIO load back to the target register.
   *
   * \pre `m` describes a load access previously produced by decode_mmio().
   */
  void writeback_mmio(Mem_access const &m) const
  {
    assert(m.access == Mem_access::Load);

    Mips::Instruction insn(_s->r.bad_instr);
    if (insn.is_simple_load_store())
      _s->r.r[insn.rt()]
        = reg_extend_width(m.value, m.width, insn.op_mem_unsigned());
    else
      fpu_state()->write(insn.rt(), m.width, m.value);
  }

  /// FPU save area of this vCPU (may be 0 before alloc_fpu_state()).
  Fpu_state *fpu_state() const
  { return reinterpret_cast<Fpu_state *>(_s->user_data[Reg_fpu_state]); }

  /// Allocate the FPU save area for this vCPU.
  void alloc_fpu_state() const
  {
    _s->user_data[Reg_fpu_state]
      = reinterpret_cast<l4_umword_t>(new Fpu_state());
  }

  /// Free the FPU save area, if any.
  void free_fpu_state() const
  {
    if (fpu_state())
      {
        delete fpu_state();
        _s->user_data[Reg_fpu_state] = 0;
      }
  }

  /// Processor ID (CP0 PRId value) presented to the guest.
  l4_umword_t proc_id() const
  { return _s->user_data[Reg_proc_id]; }

  void set_proc_id(l4_umword_t id) const
  { _s->user_data[Reg_proc_id] = id; }

  /// Access the extended VM state attached to the vCPU page.
  State *state()
  { return reinterpret_cast<State *>((char *)_s + L4_VCPU_OFFSET_EXT_STATE); }

private:
  // Slots used in the vCPU user_data area.
  enum Arch_data_regs {
    Reg_fpu_state = Reg_arch_base,
    Reg_proc_id,
    Reg_arch_end
  };

  static_assert(Reg_arch_end <= 7, "Too many user_data registers used");
};
} // namespace

View File

@@ -0,0 +1,43 @@
/*
* Copyright (C) 2022-2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include "binary_loader_linux.h"
namespace Boot {

/**
 * Try to load \a image as a raw RISC-V Linux kernel image.
 *
 * Recognizes the RISC-V Linux image header by its "RSC\x05" magic and
 * loads the image at the text offset given in the header, relative to
 * the first free RAM address.
 *
 * \retval L4_EOK      Image recognized and loaded; *entry is set.
 * \retval -L4_EINVAL  Not a (valid) RISC-V Linux image.
 */
int Linux_loader::load(char const * /*bin*/, std::shared_ptr<Binary_ds> image,
                       Vmm::Vm_ram *ram, Vmm::Ram_free_list *free_list,
                       l4_addr_t *entry)
{
trace().printf("Checking for Linux image...\n");

if (!image->is_valid())
return -L4_EINVAL;

Vmm::Guest_addr ram_base = free_list->first_free_address();
// NOTE(review): reads up to offset 0x3B without checking the image is at
// least that large -- confirm Binary_ds guarantees a minimum size.
unsigned char const *h = static_cast<unsigned char const *>(image->get_data());

if (   h[0x38] == 'R' && h[0x39] == 'S'
    && h[0x3A] == 'C' && h[0x3B] == 0x05) // Linux header RSC\x05
{
// Text offset from the image header: load address relative to RAM base.
l4_uint64_t l = *reinterpret_cast<l4_uint64_t const *>(&h[8]);
*entry = image->load_as_raw(ram, ram_base + l, free_list);
// TODO: Can we detect the bitness of the Linux image? Currently the _64bit
// field is not used by uvmm on RISC-V, but still.
_64bit = true;
}
else
return -L4_EINVAL;

info().printf("Linux kernel detected\n");

return L4_EOK;
}

// Register this loader with the given boot priority.
static Linux_loader f __attribute__((init_priority(Boot::Linux)));

}

View File

@@ -0,0 +1,29 @@
/*
* Copyright (C) 2022-2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
namespace Boot {

enum
{
// Conventional RISC-V kernel load addresses (firmware occupies the
// start of RAM at 0x80000000) -- values match the common OpenSBI layout.
#if __riscv_xlen == 32
  Kernel_boot_address = 0x80400000,
#else
  Kernel_boot_address = 0x80200000,
#endif
};

/**
 * Load \a image verbatim at the architecture's default kernel address.
 *
 * \retval L4_EOK  Always; *entry is set to the load address.
 */
static int raw_load_image(std::shared_ptr<Binary_ds> image, Vmm::Vm_ram *ram,
                          Vmm::Ram_free_list *free_list, l4_addr_t *entry)
{
*entry = image->load_as_raw(ram, Vmm::Guest_addr(Kernel_boot_address),
                            free_list);
return L4_EOK;
}

}

View File

@@ -0,0 +1,171 @@
/*
* Copyright (C) 2020-2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
* Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#include <sys/asm.h>
#include <l4/util/util.h>
#include "cpu_dev.h"
#include "guest.h"
#include "riscv_arch.h"
namespace Vmm
{

/**
 * Construct the CPU device and patch the device tree ISA string.
 *
 * If the kernel reports the Sstc extension, it is appended to the node's
 * "riscv,isa" property so the guest can make use of it.
 */
Cpu_dev::Cpu_dev(unsigned idx, unsigned phys_id, Vdev::Dt_node const *node)
: Generic_cpu_dev(idx, phys_id)
{
char const *prop_isa_str = node ? node->get_prop<char>("riscv,isa", nullptr)
                                : nullptr;
if (!prop_isa_str)
return;

std::string isa_str = prop_isa_str;

bool has_ext_sstc = l4_kip_has_isa_ext(l4re_kip(), L4_riscv_isa_ext_sstc);
if (has_ext_sstc && isa_str.find("_sstc") == std::string::npos)
{
// Indicate in the device tree that the SSTC extension is available.
isa_str += "_sstc";
node->setprop_string("riscv,isa", isa_str.c_str());
}
}

/**
 * Bring a vCPU marked On_pending online.
 *
 * A first start schedules the vCPU thread; a restart (recognizable by an
 * already initialized entry_sp) wakes the parked thread instead.
 *
 * \retval true   Startup initiated.
 * \retval false  CPU was not in On_pending state or the wakeup failed.
 */
bool
Cpu_dev::start_vcpu()
{
if (online_state() != Cpu_state::On_pending)
{
// Should we convert this to an assert()?
Err().printf("%s: CPU%d not in On_pending state", __func__, _phys_cpu_id);
return false;
}

Dbg(Dbg::Cpu, Dbg::Info)
.printf("Initiating cpu startup @ 0x%lx\n", _vcpu->r.ip);

// entry_sp set means the vCPU thread already ran before: restart it.
if (_vcpu->entry_sp && !restart_vcpu())
{
mark_off();
return false;
}
else
reschedule();

return true;
}

/**
 * Park the vCPU until it is restarted, then reset and reenter the guest.
 *
 * Never returns; reset() resumes guest execution.
 */
void L4_NORETURN
Cpu_dev::stop_vcpu()
{
mark_off();

// Block in IPC until start_vcpu()/restart_vcpu() moves us to On_prepared.
while (online_state() != Cpu_state::On_prepared)
_vcpu.wait_for_ipc(l4_utcb(), L4_IPC_NEVER);

reset();
}

/**
 * Wake a parked vCPU thread by triggering its restart IRQ.
 *
 * \retval true   Wakeup IRQ delivered.
 * \retval false  Triggering the IRQ failed.
 */
bool
Cpu_dev::restart_vcpu()
{
assert(_vcpu->entry_sp);

mark_on_prepared();

l4_msgtag_t res = _restart_event.obj_cap()->trigger();
if (!l4_msgtag_has_error(res))
return true;

Err().printf("Error waking Cpu%d: %lx\n", _vcpu.get_vcpu_id(), l4_error(res));
return false;
}

/// Power up the vCPU thread and register its restart/stop IRQ objects.
void
Cpu_dev::powerup_cpu()
{
Generic_cpu_dev::powerup_cpu();

// Now the vCPU thread exists and the IPC registry is setup.
auto *registry = vcpu().get_ipc_registry();
L4Re::chkcap(registry->register_irq_obj(&_restart_event),
             "Cannot register CPU restart event");
_stop_irq.arm(registry);
}

/**
 * Initialize the vCPU and H-extension state and enter the guest.
 *
 * Sets up entry point/stack, exception and interrupt delegation and then
 * resumes into the guest. Never returns.
 */
void
Cpu_dev::reset()
{
using namespace Riscv;

// set thread local cpu id
vmm_current_cpu_id = _vcpu.get_vcpu_id();

_vcpu->entry_ip = reinterpret_cast<l4_umword_t>(&Guest::vcpu_entry);
// On first startup reuse the current stack (16-byte aligned) as entry stack.
if (!_vcpu->entry_sp)
{
l4_umword_t sp;
asm volatile ("mv %0, sp" : "=r" (sp));
_vcpu->entry_sp = sp & ~0xful;
}

_vcpu->saved_state = L4_VCPU_F_FPU_ENABLED
                     | L4_VCPU_F_USER_MODE
                     | L4_VCPU_F_IRQ
                     | L4_VCPU_F_PAGE_FAULTS
                     | L4_VCPU_F_EXCEPTIONS;

// Guest runs in VS-mode; trap guest WFI into the VMM (vtw).
_vcpu->r.hstatus = L4_vm_hstatus_spvp | L4_vm_hstatus_vtw;
#if __riscv_xlen == 64
_vcpu->r.hstatus |= static_cast<l4_umword_t>(L4_vm_hstatus_vsxl_64)
                    << L4_vm_hstatus_vsxl_shift;
#endif

auto *vm_state = _vcpu.vm_state();
// Delegate guest-handleable exceptions directly to the guest.
vm_state->hedeleg = 1 << Exc_inst_misaligned
                    | 1 << Exc_inst_access
                    | 1 << Exc_illegal_inst
                    | 1 << Exc_breakpoint
                    | 1 << Exc_load_acesss
                    | 1 << Exc_store_acesss
                    | 1 << Exc_ecall
                    | 1 << Exc_inst_page_fault
                    | 1 << Exc_load_page_fault
                    | 1 << Exc_store_page_fault;

// Delegate virtual supervisor interrupts to the guest.
vm_state->hideleg = 1 << (Int_virtual_supervisor_software & ~Msb)
                    | 1 << (Int_virtual_supervisor_timer & ~Msb)
                    | 1 << (Int_virtual_supervisor_external & ~Msb);

vm_state->hvip = 0;
vm_state->hip = 0;
vm_state->hie = 0;

vm_state->htimedelta = 0;

vm_state->htval = 0;
vm_state->htinst = 0;

Dbg(Dbg::Core, Dbg::Info)
.printf("Starting vcpu %d @ 0x%lx (handler @ %lx with stack @ %lx)\n",
        _vcpu.get_vcpu_id(), _vcpu->r.ip, _vcpu->entry_ip, _vcpu->entry_sp);

mark_on();

// NOTE(review): default-constructed (invalid) cap -- relies on the kernel
// treating it as the current thread for vcpu_resume; confirm.
L4::Cap<L4::Thread> self;
auto e = l4_error(self->vcpu_resume_commit(self->vcpu_resume_start()));

Err().printf("VMM exited with %ld\n", e);
stop_vcpu();

// Failed to take vCPU offline. Should not happen but play safe.
l4_sleep_forever();
}

} // namespace

View File

@@ -0,0 +1,182 @@
/*
* Copyright (C) 2020-2024, 2023-2024 Kernkonzept GmbH.
* Author(s): Georg Kotheimer <georg.kotheimer@kernkonzept.com>
* Sarah Hoffmann <sarah.hoffmann@kernkonzept.com>
* Alexander Warg <alexander.warg@kernkonzept.com>
*
* License: see LICENSE.spdx (in this directory or the directories above)
*/
#pragma once
#include <atomic>
#include "generic_cpu_dev.h"
#include "vcpu_ic.h"
#include "monitor/cpu_dev_cmd_handler.h"
namespace Vmm {
extern __thread unsigned vmm_current_cpu_id;
/**
 * RISC-V CPU device: manages one vCPU's lifecycle and online state.
 */
class Cpu_dev
: public Generic_cpu_dev,
  public Monitor::Cpu_dev_cmd_handler<Monitor::Enabled, Cpu_dev>
{
public:
// Maximum number of CPUs that are addressable.
enum { Max_cpus = 8 };

Cpu_dev(unsigned idx, unsigned phys_id, Vdev::Dt_node const *node);

/**
 * CPU states
 */
enum class Cpu_state
{
  Off,
  On_pending,
  On_prepared,
  On,
  Suspended,
};

/**
 * Translate a device tree "reg" value to an internally usable CPU id.
 *
 * For most architectures this is NOP, but some architectures like ARM
 * might encode topology information into this value, which needs to
 * be translated.
 */
static unsigned dtid_to_cpuid(l4_int32_t prop_val)
{ return prop_val; }

static bool has_fixed_dt_mapping() { return true; }

/// Physical (kernel-visible) CPU id this device is bound to.
unsigned get_phys_cpu_id() const noexcept
{ return _phys_cpu_id; }

// TODO: Starting and stopping of vCPUs is adopted from ARM,
// merging this functionality probably would be beneficial.
bool start_vcpu();
void L4_NORETURN stop_vcpu();
bool restart_vcpu();
void powerup_cpu() override;
void L4_NORETURN reset() final override;
void L4_NORETURN stop() override { stop_vcpu(); };

/**
 * Get the online state of a CPU.
 */
Cpu_state online_state() const
{ return std::atomic_load(&_cpu_state); }

/**
 * Is the CPU online?
 */
bool online() const
{ return online_state() != Cpu_state::Off; }

/**
 * Cpu_state changes
 * * Off -> On_pending: concurrent execution
 * * On_pending -> On: CPU local, no concurrency (initial startup)
 * * On_pending -> On_prepared: CPU local, no concurrency (restart)
 * * On_prepared -> On: CPU local, no concurrency (restart)
 * * On* -> Off: CPU local, no concurrency
 * * On -> Suspended: CPU local, no concurrency
 * * Suspended -> On: CPU local, no concurrency
 *
 * The only state change that requires protection against concurrent access
 * is the change from Off to On_pending. Therefore mark_pending() uses
 * compare/exchange, the other operations use a simple store.
 */

/**
 * Mark CPU as On_pending.
 *
 * \retval True Successfully changed state from Off to On_pending
 * \retval False Failed to change the state from Off to On_pending,
 *         the state was already changed by someone else.
 */
bool mark_on_pending()
{
// Atomically change state from Off to On_pending, see above
Cpu_state expected{Cpu_state::Off};
return std::atomic_compare_exchange_strong(&_cpu_state, &expected,
                                           Cpu_state::On_pending);
}

/**
 * Mark CPU as On_prepared.
 *
 * The vCPU entry has been setup and the guest is about to be entered
 * again. This state is only used when restarting a CPU that was previously
 * powered off.
 */
void mark_on_prepared()
{
assert(online_state() == Cpu_state::On_pending);
std::atomic_store(&_cpu_state, Cpu_state::On_prepared);
}

/**
 * Mark CPU as Off.
 *
 * Marks the CPU as Off. The current state has to be either On (CPU is
 * switched off) or On_pending (we failed to get the CPU up and fall
 * back to Off)
 */
void mark_off()
{
assert(online_state() != Cpu_state::Off);
std::atomic_store(&_cpu_state, Cpu_state::Off);
}

/**
 * Mark CPU as On.
 *
 * Marks the CPU as On. The current state has to be On_pending,
 * On_prepared or Suspended (see assertion below).
 */
void mark_on()
{
assert(online_state() == Cpu_state::On_pending ||
       online_state() == Cpu_state::On_prepared ||
       online_state() == Cpu_state::Suspended);
std::atomic_store(&_cpu_state, Cpu_state::On);
}

/**
 * Mark CPU as Suspended.
 *
 * Marks the CPU as Suspended. The current state has to be On.
 */
void mark_suspended()
{
assert(online_state() == Cpu_state::On);
std::atomic_store(&_cpu_state, Cpu_state::Suspended);
}

/// Attach the per-vCPU interrupt controller.
void set_vcpu_ic(cxx::Ref_ptr<Gic::Vcpu_ic> vcpu_ic)
{
_vcpu_ic = vcpu_ic;
}

private:
/**
 * Trivial interrupt to wakeup stopped vCPU.
 */
struct Restart_event : public L4::Irqep_t<Restart_event>
{
public:
// No payload; the wakeup from IPC wait is the whole purpose.
void handle_irq() {}
};

cxx::Ref_ptr<Gic::Vcpu_ic> _vcpu_ic;
// Current lifecycle state; see the transition table above.
std::atomic<Cpu_state> _cpu_state{Cpu_state::Off};
Restart_event _restart_event;
};
}

Some files were not shown because too many files have changed in this diff Show More