author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 03:01:46 +0000
---|---|---
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 03:01:46 +0000
commit | f8fe689a81f906d1b91bb3220acde2a4ecb14c5b (patch) |
tree | 26484e9d7e2c67806c2d1760196ff01aaa858e8c | /src/recompiler
parent | Initial commit. (diff) |
Adding upstream version 6.0.4-dfsg. (refs: upstream/6.0.4-dfsg, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
100 files changed, 70462 insertions, 0 deletions
diff --git a/src/recompiler/.scm-settings b/src/recompiler/.scm-settings new file mode 100644 index 00000000..946bad9f --- /dev/null +++ b/src/recompiler/.scm-settings @@ -0,0 +1,34 @@ +# $Id: .scm-settings $ +## @file +# Source code massager settings for the recompiler. +# + +# +# Copyright (C) 2017-2019 Oracle Corporation +# +# This file is part of VirtualBox Open Source Edition (OSE), as +# available from http://www.virtualbox.org. This file is free software; +# you can redistribute it and/or modify it under the terms of the GNU +# General Public License (GPL) as published by the Free Software +# Foundation, in version 2 as it comes in the "COPYING" file of the +# VirtualBox OSE distribution. VirtualBox OSE is distributed in the +# hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +# + + +# This is external stuff. +--external-copyright --no-convert-tabs +/.scm-settings: --no-external-copyright --convert-tabs +/Makefile.kmk: --no-external-copyright --convert-tabs +/VBox*: --no-external-copyright --convert-tabs +/Sun/*: --no-external-copyright --convert-tabs +/Sun/e_*.S: --external-copyright --no-convert-tabs + +*.com: --treat-as binary + +*.h: --no-fix-header-guards + +/tests/linux-test.c: --lgpl-disclaimer +/tests/test-i386.c: --lgpl-disclaimer +/tests/test-mmap.c: --lgpl-disclaimer + diff --git a/src/recompiler/COPYING.LIB b/src/recompiler/COPYING.LIB new file mode 100644 index 00000000..bfd28e23 --- /dev/null +++ b/src/recompiler/COPYING.LIB @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. 
+ + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. 
+ + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. 
+ + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. 
+ + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) 
+ + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. 
If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. 
+ +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/src/recompiler/Makefile.kmk b/src/recompiler/Makefile.kmk new file mode 100644 index 00000000..28c8ea95 --- /dev/null +++ b/src/recompiler/Makefile.kmk @@ -0,0 +1,340 @@ +# $Id: Makefile.kmk $ +## @file +# The Recompiler Sub-Makefile. +# + +# +# Copyright (C) 2006-2019 Oracle Corporation +# +# This file is part of VirtualBox Open Source Edition (OSE), as +# available from http://www.virtualbox.org. This file is free software; +# you can redistribute it and/or modify it under the terms of the GNU +# General Public License (GPL) as published by the Free Software +# Foundation, in version 2 as it comes in the "COPYING" file of the +# VirtualBox OSE distribution. VirtualBox OSE is distributed in the +# hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +# + + +SUB_DEPTH = ../.. +include $(KBUILD_PATH)/subheader.kmk + +# +# Globals +# +VBOX_PATH_RECOMPILER_SRC := $(PATH_SUB_CURRENT) +# Workaround for darwin hell. +ifeq ($(KBUILD_TARGET),darwin) + VBOX_WITHOUT_REM_LDR_CYCLE := 1 +endif +ifeq ($(KBUILD_TARGET).$(KBUILD_TARGET_ARCH),win.amd64) + VBOX_USE_MINGWW64 = 1 +endif + + +# +# The primary REM module definition. +# +# This is extended by one of the VBoxREM* modules below. Currently, this +# isn't done by inheritance because of some obscure bug wrt inheriting from +# unused targets that I'm not going to fix now. +# +ifneq ($(KBUILD_TARGET),win) + VBoxRemPrimary_TEMPLATE = VBOXR3NP + # workaround the regparm bug in gcc <= 3.3 + VBoxRemPrimary_DEFS = $(if $(VBOX_GCC_BUGGY_REGPARM),GCC_WITH_BUGGY_REGPARM,) +else + VBoxRemPrimary_TEMPLATE = DUMMY + VBoxRemPrimary_TOOL.win.x86 = MINGW32 + ifdef VBOX_USE_MINGWW64 + VBoxRemPrimary_TOOL.win.amd64 = MINGWW64 + else + VBoxRemPrimary_TOOL.win.amd64 = XGCCAMD64LINUX + endif + VBoxRemPrimary_SDKS.win.x86 = W32API + VBoxRemPrimary_ASFLAGS = -x assembler-with-cpp + VBoxRemPrimary_CFLAGS = -Wall -g -fno-omit-frame-pointer -fno-strict-aliasing -Wno-shadow + VBoxRemPrimary_CFLAGS.debug = -O0 + VBoxRemPrimary_CFLAGS.release += -fno-gcse -O2 + VBoxRemPrimary_CFLAGS.profile = $(VBoxRemPrimary_CFLAGS.release) + VBoxRemPrimary_DEFS += IN_RING3 $(ARCH_BITS_DEFS) + # Workaround the regparm bug in gcc <= 3.3. + VBoxRemPrimary_DEFS.win.x86 += GCC_WITH_BUGGY_REGPARM + # Missing fpclassify. Is there a better define or flag for this? + VBoxRemPrimary_DEFS.solaris += __C99FEATURES__ +endif # win +VBoxRemPrimary_DEFS += IN_REM_R3 REM_INCLUDE_CPU_H NEED_CPU_H +#VBoxRemPrimary_DEFS += REM_PHYS_ADDR_IN_TLB +#VBoxRemPrimary_DEFS += DEBUG_ALL_LOGGING DEBUG_DISAS DEBUG_PCALL CONFIG_DEBUG_EXEC DEBUG_FLUSH DEBUG_IOPORT DEBUG_SIGNAL DEBUG_TLB_CHECK DEBUG_TB_INVALIDATE DEBUG_TLB # Enables huge amounts of debug logging. +#VBoxRemPrimary_DEFS += DEBUG_DISAS DEBUG_PCALL CONFIG_DEBUG_EXEC DEBUG_FLUSH DEBUG_IOPORT DEBUG_SIGNAL DEBUG_TLB_CHECK DEBUG_TB_INVALIDATE DEBUG_TLB # Enables huge amounts of debug logging. 
+ifdef VBOX_WITH_RAW_MODE + VBoxRemPrimary_DEFS += VBOX_WITH_RAW_MODE +endif +ifdef VBOX_WITH_RAW_RING1 + VBoxRemPrimary_DEFS += VBOX_WITH_RAW_RING1 +endif +VBoxRemPrimary_DEFS.linux = _GNU_SOURCE +ifdef VBOX_SOLARIS_10 + VBoxRemPrimary_DEFS.solaris = CONFIG_SOLARIS_VERSION=10 +else + VBoxRemPrimary_DEFS.solaris = CONFIG_SOLARIS_VERSION=11 +endif +VBoxRemPrimary_DEFS.freebsd += _BSD +VBoxRemPrimary_DEFS.amd64 += __x86_64__ +VBoxRemPrimary_DEFS.x86 += __i386__ + +VBoxRemPrimary_INCS = \ + Sun \ + target-i386 \ + tcg \ + fpu \ + $(VBoxRemPrimary_0_OUTDIR) \ + $(PATH_ROOT)/src/VBox/VMM/include \ + tcg/i386 \ + . +ifn1of ($(VBoxRemPrimary_DEFS),DEBUG_TMP_LOGGING) + VBoxRemPrimary_DEFS += LOG_USE_C99 + VBoxRemPrimary_INCS <= \ + Sun/crt +endif + +VBoxRemPrimary_SOURCES = \ + VBoxRecompiler.c \ + cpu-exec.c \ + exec.c \ + translate-all.c \ + host-utils.c \ + cutils.c \ + tcg-runtime.c \ + tcg/tcg.c \ + tcg/tcg-dyngen.c \ + fpu/softfloat-native.c \ + target-i386/op_helper.c \ + target-i386/helper.c \ + target-i386/translate.c +VBoxRemPrimary_SOURCES.debug += \ + Sun/testmath.c +VBoxRemPrimary_SOURCES.win = $(VBoxREMImp_0_OUTDIR)/VBoxREMRes.o +VBoxRemPrimary_SOURCES.win.x86 = $(VBoxREMImp_0_OUTDIR)/VBoxREMWin.def +ifdef VBOX_USE_MINGWW64 + if 0 # exporting all helps when windbg pops up on crashes + VBoxRemPrimary_SOURCES.win.amd64 = $(VBoxREMImp_0_OUTDIR)/VBoxREMWin.def + else + VBoxRemPrimary_LDFLAGS.win.amd64 = --export-all + endif +endif + +ifndef VBOX_USE_MINGWW64 +VBoxRemPrimary_LIBS = \ + $(LIB_VMM) \ + $(LIB_RUNTIME) +else +VBoxRemPrimary_LIBS = \ + $(VBoxRemPrimary_0_OUTDIR)/VBoxVMMImp.a \ + $(VBoxRemPrimary_0_OUTDIR)/VBoxRTImp.a +VBoxRemPrimary_CLEAN = \ + $(VBoxRemPrimary_0_OUTDIR)/VBoxVMMImp.a \ + $(VBoxRemPrimary_0_OUTDIR)/VBoxVMMImp.def \ + $(VBoxRemPrimary_0_OUTDIR)/VBoxRTImp.a \ + $(VBoxRemPrimary_0_OUTDIR)/VBoxRTImp.def +endif + +VBoxRemPrimary_LDFLAGS.solaris = -mimpure-text +if defined(VBOX_WITH_HARDENING) && "$(KBUILD_TARGET)" == "win" + VBoxRemPrimary_POST_CMDS = \ + $(VBOX_VCC_EDITBIN) /LargeAddressAware /DynamicBase /NxCompat /Release /IntegrityCheck \ + /Version:$(VBOX_VERSION_MAJOR)0$(VBOX_VERSION_MINOR).$(VBOX_VERSION_BUILD) "$(out)" \ + $$(NLTAB)$(VBOX_SIGN_IMAGE_CMDS) +else + VBoxRemPrimary_POST_CMDS = $(VBOX_SIGN_IMAGE_CMDS) +endif + + +if "$(KBUILD_TARGET).$(KBUILD_TARGET_ARCH)" == "win.amd64" && !defined(VBOX_USE_MINGWW64) + # + # VBoxREM2/VBoxRemPrimary - Currently only used by 64-bit Windows. + # (e_powl-xxx.S doesn't fit in IPRT because it requires GAS and is LGPL.) + # + SYSMODS += VBoxRemPrimary + VBoxRemPrimary_TEMPLATE = VBOXNOCRTGAS + VBoxRemPrimary_NAME = VBoxREM2 + VBoxRemPrimary_DEFS += LOG_USE_C99 $(ARCH_BITS_DEFS) + VBoxRemPrimary_SOURCES += \ + Sun/e_powl-$(KBUILD_TARGET_ARCH).S + VBoxRemPrimary_INCS += \ + Sun/crt + VBoxRemPrimary_SYSSUFF = .rel + VBoxRemPrimary_LIBS = \ + $(PATH_STAGE_LIB)/RuntimeR3NoCRTGCC$(VBOX_SUFF_LIB) + VBoxRemPrimary_POST_CMDS = $(NO_SUCH_VARIABLE) + VBOX_REM_WRAPPER = 2 + +else if "$(KBUILD_TARGET_ARCH)" == "x86" && defined(VBOX_WITH_64_BITS_GUESTS) + # + # For 32-bit targets when enabled 64-bit guests we build 2 REM DLLs: + # with 64-bit support (slow and buggy at the moment) VBOXREM64 + # only 32-bit support (faster, stable, but not suitable for 64-bit guests) VBOXREM32 + # During the runtime, we load appropriate library from VBOXREM, depending on guest settings. 
+ # 64-bit targets have 64-bit enabled REM by default, so is not part of this mess + # + + # + # VBoxREM32/VBoxRemPrimary + # + DLLS += VBoxRemPrimary + VBoxRemPrimary_NAME = VBoxREM32 + VBoxRemPrimary_LDFLAGS.darwin = -install_name $(VBOX_DYLD_EXECUTABLE_PATH)/VBoxREM32.dylib + VBOX_REM_WRAPPER = 32 + + # + # VBoxREM64 + # + DLLS += VBoxREM64 + VBoxREM64_EXTENDS = VBoxRemPrimary + VBoxREM64_EXTENDS_BY = appending + VBoxREM64_NAME = VBoxREM64 + VBoxREM64_DEFS = VBOX_ENABLE_VBOXREM64 + VBoxREM64_LDFLAGS.darwin = -install_name $(VBOX_DYLD_EXECUTABLE_PATH)/VBoxREM64.dylib + +else + # + # VBoxREM/VBoxRemPrimary - Normal. + # + DLLS += VBoxRemPrimary + VBoxRemPrimary_NAME = VBoxREM + VBoxRemPrimary_LDFLAGS.darwin = -install_name $(VBOX_DYLD_EXECUTABLE_PATH)/VBoxREM3.dylib + + ifdef VBOX_USE_MINGWW64 + # GNU ld (rubenvb-4.5.4) 2.22.52.20120716 doesn't fix up rip relative + # addressing in the import libraries generated by microsoft link.exe. So, we + # have to regenerate these. + # Note! The chdir to the output directory is because dlltool writes temporary files to the current directory. + $$(VBoxRemPrimary_0_OUTDIR)/VBoxVMMImp.a \ + $$(VBoxRemPrimary_0_OUTDIR)/VBoxRTImp.a : $$(VBoxRemPrimary_0_OUTDIR)/$$(notdir $$(basename $$@)).def + $(REDIRECT) -C "$(dir $@)" -- $(TOOL_MINGWW64_DLLTOOL) \ + --output-lib "$@" \ + --input-def "$<" \ + --dllname "$(patsubst %Imp.a,%.dll,$(notdir $@))" + + $$(VBoxRemPrimary_0_OUTDIR)/VBoxVMMImp.def \ + $$(VBoxRemPrimary_0_OUTDIR)/VBoxRTImp.def : \ + $(PATH_STAGE_BIN)/$$(patsubst %Imp.def,%.dll,$$(notdir $$@)) \ + | $$(dir $$@) + $(APPEND) -nt $@ "LIBRARY $(notdir $<)" "EXPORTS" + $(TOOL_$(VBOX_VCC_TOOL)_DUMPBIN) /EXPORTS "$<" \ + | $(SED) -e '/ = /!d' \ + -e 's/^.* \([^ ][^ ]*\) = .*$(DOLLAR)/ \"\1\"/' \ + --append $@ + endif # VBOX_USE_MINGWW64 + +endif + + +ifdef VBOX_REM_WRAPPER + # + # VBoxREM - Wrapper for loading VBoxREM2, VBoxREM32 or VBoxREM64. + # + DLLS += VBoxREMWrapper + VBoxREMWrapper_TEMPLATE = VBoxR3DllWarnNoPic + VBoxREMWrapper_NAME = VBoxREM + VBoxREMWrapper_DEFS = IN_REM_R3 + if "$(KBUILD_TARGET_ARCH)" == "x86" && defined(VBOX_WITH_64_BITS_GUESTS) + VBoxREMWrapper_DEFS += VBOX_USE_BITNESS_SELECTOR + endif + ifdef VBOX_WITHOUT_REM_LDR_CYCLE + VBoxREMWrapper_DEFS += VBOX_WITHOUT_REM_LDR_CYCLE + endif + VBoxREMWrapper_SOURCES = \ + VBoxREMWrapper.cpp + VBoxREMWrapper_SOURCES.win = VBoxREM.rc + if "$(KBUILD_TARGET).$(KBUILD_TARGET_ARCH)" == "win.amd64" && !defined(VBOX_USE_MINGWW64) + VBoxREMWrapper_SOURCES += \ + VBoxREMWrapperA.asm + endif + VBoxREMWrapper_LIBS = \ + $(LIB_RUNTIME) + ifndef VBOX_WITHOUT_REM_LDR_CYCLE + VBoxREMWrapper_LIBS += \ + $(LIB_VMM) + VBoxREMWrapper_LIBS.darwin += \ + $(TARGET_VBoxREMImp) + endif + VBoxREMWrapper_LDFLAGS.darwin = -install_name $(VBOX_DYLD_EXECUTABLE_PATH)/VBoxREM.dylib +endif + + +# +# The VBoxREM import library. +# +# This is a HACK to get around (a) the cyclic dependency between VBoxVMM and +# VBoxREM during linking and (b) the recursive build ordering which means VBoxREM +# won't be built until after all the other DLLs. 
+# +IMPORT_LIBS += VBoxREMImp +VBoxREMImp_TEMPLATE = VBoxR3Dll + ifn1of ($(KBUILD_TARGET), os2 win) +VBoxREMImp_NAME = VBoxREM + endif +VBoxREMImp_INST = $(INST_LIB) +VBoxREMImp_SOURCES.win = $(VBoxREMImp_0_OUTDIR)/VBoxREMWin.def +VBoxREMImp_CLEAN.win = $(VBoxREMImp_0_OUTDIR)/VBoxREMWin.def +VBoxREMImp_SOURCES.os2 = $(VBoxREMImp_0_OUTDIR)/VBoxREMOS2.def +VBoxREMImp_CLEAN.os2 = $(VBoxREMImp_0_OUTDIR)/VBoxREMOS2.def + ifn1of ($(KBUILD_TARGET), os2 win) +VBoxREMImp_SOURCES = $(VBoxREMImp_0_OUTDIR)/VBoxREMImp.c +VBoxREMImp_CLEAN = $(VBoxREMImp_0_OUTDIR)/VBoxREMImp.c + endif + ifn1of ($(KBUILD_TARGET), darwin os2 win) +VBoxREMImp_SONAME = VBoxREM$(SUFF_DLL) + endif +ifdef VBOX_WITHOUT_REM_LDR_CYCLE + VBoxREMImp_LDFLAGS.darwin = -install_name $(VBOX_DYLD_EXECUTABLE_PATH)/VBoxREM.dylib +else + VBoxREMImp_LDFLAGS.darwin = -install_name $(subst @rpath,@executable_path,$(VBOX_DYLD_EXECUTABLE_PATH))/VBoxREM.dylib +endif +VBoxREMImp_LDFLAGS.l4 = -T$(L4_LIBDIR)/../main_rel.ld -nostdlib + +$$(VBoxREMImp_0_OUTDIR)/VBoxREMImp.c: $(VBOX_PATH_RECOMPILER_SRC)/VBoxREM.def $(VBOX_PATH_RECOMPILER_SRC)/Sun/deftoimp.sed $(MAKEFILE_CURRENT) | $$(dir $$@) + $(call MSG_GENERATE,,$@) + $(QUIET)$(APPEND) -t $@ '#ifdef VBOX_HAVE_VISIBILITY_HIDDEN' + $(QUIET)$(APPEND) $@ '# define EXPORT __attribute__((visibility("default")))' + $(QUIET)$(APPEND) $@ '#else' + $(QUIET)$(APPEND) $@ '# define EXPORT' + $(QUIET)$(APPEND) $@ '#endif' + $(QUIET)$(APPEND) $@ '' + $(QUIET)$(SED) -f $(VBOX_PATH_RECOMPILER_SRC)/Sun/deftoimp.sed --append $@ $< + +$$(VBoxREMImp_0_OUTDIR)/VBoxREMOS2.def: $(VBOX_PATH_RECOMPILER_SRC)/VBoxREM.def $(MAKEFILE_CURRENT) | $$(dir $$@) + $(SED) \ + -e 's/^[ \t][ \t]*REMR3/ _REMR3/' \ + -e 's/\.[Dd][Ll][Ll]//' \ + -e 's/^LIBRARY .*/LIBRARY VBoxREM INITINSTANCE TERMINSTANCE\nDATA MULTIPLE\n/' \ + --output $@ \ + $< + +$$(VBoxREMImp_0_OUTDIR)/VBoxREMWin.def: $(VBOX_PATH_RECOMPILER_SRC)/VBoxREM.def $(MAKEFILE_CURRENT) | $$(dir $$@) + $(CP) -f $< $@ + +$$(VBoxREMImp_0_OUTDIR)/VBoxREMRes.o: $(VBOX_PATH_RECOMPILER_SRC)/VBoxREM.rc $(MAKEFILE_CURRENT) $(VBOX_VERSION_MK) | $$(dir $$@) + $(call MSG_GENERATE,,$@) + $(QUIET)$(REDIRECT) -E 'COMSPEC=$(VBOX_GOOD_COMSPEC_BS)' \ + -- $(TOOL_$(VBoxRemPrimary_TOOL.win.$(KBUILD_TARGET_ARCH))_PREFIX)windres \ + $(addprefix -I,$(INCS) $(PATH_SDK_$(VBOX_WINPSDK)_INC) $(PATH_TOOL_$(VBOX_VCC_TOOL)_INC)) \ + -DVBOX_SVN_REV=$(VBOX_SVN_REV) \ + -DVBOX_SVN_REV_MOD_5K=$(expr $(VBOX_SVN_REV) % 50000) \ + $< $@ + +# +# The math testcase as a standalone program for testing and debugging purposes. +# +## @todo This is a bit messy because of MINGW32. +testmath_ASFLAGS.amd64 = -m amd64 +testmath_CFLAGS = -Wall -g +testmath_CFLAGS.release = -O3 +testmath_LDFLAGS = -g +testmath_DEFS = MATHTEST_STANDALONE +testmath_SOURCES = Sun/testmath.c + + +include $(FILE_KBUILD_SUB_FOOTER) + diff --git a/src/recompiler/README.vbox b/src/recompiler/README.vbox new file mode 100644 index 00000000..d478b71e --- /dev/null +++ b/src/recompiler/README.vbox @@ -0,0 +1,6 @@ +QEMU is based on v0.13.0 (6ed912999d6ef636a5be5ccb266d7d3c0f0310b4) +from git://git.savannah.nongnu.org/qemu.git. + +Modified parts related to calls of external functions, to allow loading +recompiler library as regular shared object in high memory on 64-bit systems. 
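The README.vbox note above describes a reach problem: on a 64-bit host, a library mapped high in the address space cannot reach code below 4 GiB with a 32-bit relative call (a `call rel32` displacement only spans +/- 2 GiB), so calls to external functions have to go through absolute pointers. What follows is a minimal, hypothetical C sketch of that general idea; the names and the import-table shape are illustrative assumptions, not the actual VirtualBox wrapper code.

    /*
     * Illustrative sketch only (not part of this commit): calling an external
     * function from a module that may be loaded far above 4 GiB.  A direct
     * `call rel32` only reaches +/- 2 GiB, so the call goes through an
     * absolute function pointer filled in at load time.  All names here are
     * hypothetical.
     */
    #include <stdint.h>

    typedef int (*PFNREMEXTLOG)(const char *pszMsg);  /* imported function type */

    static PFNREMEXTLOG g_pfnLog;  /* 64-bit absolute pointer, reaches any address */

    void remSketchResolveImports(PFNREMEXTLOG pfnLog)
    {
        g_pfnLog = pfnLog;         /* resolved by the loader/wrapper at load time */
    }

    int remSketchLog(const char *pszMsg)
    {
        /* Indirect call: no rel32 displacement limit between the two modules. */
        return g_pfnLog(pszMsg);
    }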
+
diff --git a/src/recompiler/Sun/Makefile.kup b/src/recompiler/Sun/Makefile.kup
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/recompiler/Sun/Makefile.kup
diff --git a/src/recompiler/Sun/config-host.h b/src/recompiler/Sun/config-host.h
new file mode 100644
index 00000000..a18a147e
--- /dev/null
+++ b/src/recompiler/Sun/config-host.h
@@ -0,0 +1,46 @@
+/* $Id: config-host.h $ */
+/** @file
+ * Sun host config - maintained by hand
+ */
+
+/*
+ * Copyright (C) 2006-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+
+#if defined(RT_ARCH_AMD64)
+# define HOST_X86_64 1
+# define HOST_LONG_BITS 64
+#else
+# define HOST_I386 1
+# define HOST_LONG_BITS 32
+#endif
+
+#ifndef IPRT_NO_CRT
+# ifdef RT_OS_WINDOWS
+# define CONFIG_WIN32 1
+# elif defined(RT_OS_OS2)
+# define CONFIG_OS2
+# elif defined(RT_OS_DARWIN)
+# define CONFIG_DARWIN
+# elif defined(RT_OS_FREEBSD) || defined(RT_OS_NETBSD) || defined(RT_OS_OPENBSD)
+# define HAVE_MACHINE_BSWAP_H
+/*# define CONFIG_BSD*/
+# elif defined(RT_OS_SOLARIS)
+# define CONFIG_SOLARIS
+# else
+# define HAVE_BYTESWAP_H 1
+# endif
+#endif
+#define QEMU_VERSION "0.13.0"
+#define CONFIG_UNAME_RELEASE ""
+#define CONFIG_QEMU_SHAREDIR "."
+
diff --git a/src/recompiler/Sun/config.h b/src/recompiler/Sun/config.h
new file mode 100644
index 00000000..173b660d
--- /dev/null
+++ b/src/recompiler/Sun/config.h
@@ -0,0 +1,44 @@
+/* $Id: config.h $ */
+/** @file
+ * Sun config - Maintained by hand
+ */
+
+/*
+ * Copyright (C) 2006-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+#include "config-host.h"
+#define CONFIG_QEMU_PREFIX "/usr/gnemul/qemu-i386"
+#define TARGET_ARCH "i386"
+#define TARGET_I386 1
+#define CONFIG_SOFTMMU 1
+#define TARGET_PHYS_ADDR_BITS 64
+
+#ifdef VBOX_WITH_64_BITS_GUESTS
+# if defined(__x86_64__) || defined (VBOX_ENABLE_VBOXREM64)
+# define TARGET_X86_64
+# endif
+#endif
+
+/* Uncomment to see all phys memory accesses */
+/* #define VBOX_DEBUG_PHYS */
+/* Uncomment to see emulated CPU state changes */
+/* #define VBOX_DUMP_STATE */
+/* Uncomment to see QEMU logging, goes to /tmp/vbox-qemu.log */
+/* #define DEBUG_ALL_LOGGING */
+/* Uncomment to see generated code */
+/* #define DEBUG_DISAS */
+
+#if 0 /*defined(RT_ARCH_AMD64) && defined(VBOX_STRICT)*/
+# define VBOX_CHECK_ADDR(ptr) do { if ((uintptr_t)(ptr) >= _4G) __asm__("int3"); } while (0)
+#else
+# define VBOX_CHECK_ADDR(ptr) do { } while (0)
+#endif
diff --git a/src/recompiler/Sun/crt/stdio.h b/src/recompiler/Sun/crt/stdio.h
new file mode 100644
index 00000000..5c73fb1a
--- /dev/null
+++ b/src/recompiler/Sun/crt/stdio.h
@@ -0,0 +1,73 @@
+/* $Id: stdio.h $ */
+/** @file
+ * Our minimal stdio
+ */
+
+/*
+ * Copyright (C) 2006-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+#ifndef ___Sun_stdio_h
+#define ___Sun_stdio_h
+
+#ifndef LOG_GROUP
+# define UNDO_LOG_GROUP
+#endif
+
+#include <VBox/log.h>
+
+#ifdef UNDO_LOG_GROUP
+# undef UNDO_LOG_GROUP
+# undef LOG_GROUP
+#endif
+
+#ifndef LOG_USE_C99
+# error "LOG_USE_C99 isn't defined."
+#endif
+
+RT_C_DECLS_BEGIN
+
+typedef struct FILE FILE;
+
+#if defined(RT_OS_SOLARIS)
+/** @todo Check solaris' floatingpoint.h as to why we do this */
+# define _FILEDEFED
+#endif
+
+DECLINLINE(int) fprintf(FILE *ignored, const char *pszFormat, ...)
+{
+/** @todo We don't support wrapping calls taking a va_list yet. It's not worth it yet,
+ * since there are only a couple of cases where this fprintf implementation is used.
+ * (The macro below will deal with the majority of the fprintf calls.) */
+#if 0 /*def LOG_ENABLED*/
+    if (LogIsItEnabled(NULL, 0, LOG_GROUP_REM_PRINTF))
+    {
+        va_list va;
+        va_start(va, pszFormat);
+        RTLogLoggerExV(NULL, 0, LOG_GROUP_REM_PRINTF, pszFormat, va);
+        va_end(va);
+    }
+#endif
+    return 0;
+}
+
+#define fflush(file) RTLogFlush(NULL)
+#define printf(...) LogIt(0, LOG_GROUP_REM_PRINTF, (__VA_ARGS__))
+#define fprintf(logfile, ...) LogIt(0, LOG_GROUP_REM_PRINTF, (__VA_ARGS__))
+
+#ifdef DEBUG_TMP_LOGGING
+# error "DEBUG_TMP_LOGGING doesn't work with the Sun/crt/stdio.h wrapper."
+#endif
+
+RT_C_DECLS_END
+
+#endif
+
diff --git a/src/recompiler/Sun/deftoimp.sed b/src/recompiler/Sun/deftoimp.sed
new file mode 100644
index 00000000..3e35efa1
--- /dev/null
+++ b/src/recompiler/Sun/deftoimp.sed
@@ -0,0 +1,37 @@
+# $Id: deftoimp.sed $
+## @file
+# SED script for generating a dummy .so from a windows .def file.
+#
+
+#
+# Copyright (C) 2006-2019 Oracle Corporation
+#
+# This file is part of VirtualBox Open Source Edition (OSE), as
+# available from http://www.virtualbox.org.
This file is free software; +# you can redistribute it and/or modify it under the terms of the GNU +# General Public License (GPL) as published by the Free Software +# Foundation, in version 2 as it comes in the "COPYING" file of the +# VirtualBox OSE distribution. VirtualBox OSE is distributed in the +# hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +# + +s/;.*$//g +s/^[[:space:]][[:space:]]*//g +s/[[:space:]][[:space:]]*$//g +/^$/d + +# Handle text after EXPORTS +/EXPORTS/,//{ +s/^EXPORTS$// +/^$/b end + +s/^\(.*\)$/EXPORT\nvoid \1(void);\nvoid \1(void){}/ +b end +} +d +b end + + +# next expression +:end + diff --git a/src/recompiler/Sun/e_powl-amd64.S b/src/recompiler/Sun/e_powl-amd64.S new file mode 100644 index 00000000..5e0353e7 --- /dev/null +++ b/src/recompiler/Sun/e_powl-amd64.S @@ -0,0 +1,371 @@ +/* ix87 specific implementation of pow function. + Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 Free Software Foundation + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. 
+ */ + +/*#include <machine/asm.h>*/ +#include <iprt/cdefs.h> + +#define ALIGNARG(log2) 1<<log2 +#define ASM_TYPE_DIRECTIVE(name,typearg) .type name,typearg; +#define ASM_SIZE_DIRECTIVE(name) .size name,.-name; +#define ASM_GLOBAL_DIRECTIVE .global + +#define C_LABEL(name) name: +#define C_SYMBOL_NAME(name) name + +#define ENTRY(name) \ + ASM_GLOBAL_DIRECTIVE C_SYMBOL_NAME(name); \ + ASM_TYPE_DIRECTIVE (C_SYMBOL_NAME(name),@function) \ + .align ALIGNARG(4); \ + C_LABEL(name) + +#undef END +#define END(name) \ + ASM_SIZE_DIRECTIVE(name) + + +#ifdef __ELF__ + .section .rodata +#else + .text +#endif + + .align ALIGNARG(4) + ASM_TYPE_DIRECTIVE(infinity,@object) +inf_zero: +infinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f + ASM_SIZE_DIRECTIVE(infinity) + ASM_TYPE_DIRECTIVE(zero,@object) +zero: .double 0.0 + ASM_SIZE_DIRECTIVE(zero) + ASM_TYPE_DIRECTIVE(minf_mzero,@object) +minf_mzero: +minfinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff +mzero: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + ASM_SIZE_DIRECTIVE(minf_mzero) + ASM_TYPE_DIRECTIVE(one,@object) +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + ASM_TYPE_DIRECTIVE(limit,@object) +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p63,@object) +p63: + .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) + +//#ifdef PIC +//#define MO(op) op##(%rip) +//#else +#define MO(op) op +//#endif + + .text +/*ENTRY(__ieee754_powl)*/ +ENTRY(RT_NOCRT(powl)) + + fldt 24(%rsp) // y + fxam + + + fnstsw + movb %ah, %dl + andb $0x45, %ah + cmpb $0x40, %ah // is y == 0 ? + je 11f + + cmpb $0x05, %ah // is y == ±inf ? + je 12f + + cmpb $0x01, %ah // is y == NaN ? + je 30f + + fldt 8(%rsp) // x : y + + fxam + fnstsw + movb %ah, %dh + andb $0x45, %ah + cmpb $0x40, %ah + je 20f // x is ±0 + + cmpb $0x05, %ah + je 15f // x is ±inf + + fxch // y : x + + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fldl MO(p63) // 1L<<63 : y : x + fld %st(1) // y : 1L<<63 : y : x + fabs // |y| : 1L<<63 : y : x + fcomip %st(1), %st // 1L<<63 : y : x + fstp %st(0) // y : x + jnc 2f + + /* First see whether `y' is a natural number. In this case we + can use a more precise algorithm. */ + fld %st // y : y : x + fistpll -8(%rsp) // y : x + fildll -8(%rsp) // int(y) : y : x + fucomip %st(1),%st // y : x + jne 2f + + /* OK, we have an integer value for y. */ + mov -8(%rsp),%eax + mov -4(%rsp),%edx + orl $0, %edx + fstp %st(0) // x + jns 4f // y >= 0, jump + fdivrl MO(one) // 1/x (now referred to as x) + negl %eax + adcl $0, %edx + negl %edx +4: fldl MO(one) // 1 : x + fxch + +6: shrdl $1, %edx, %eax + jnc 5f + fxch + fmul %st(1) // x : ST*x + fxch +5: fmul %st(0), %st // x*x : ST*x + shrl $1, %edx + movl %eax, %ecx + orl %edx, %ecx + jnz 6b + fstp %st(0) // ST*x + ret + + /* y is ±NAN */ +30: fldt 8(%rsp) // x : y + fldl MO(one) // 1.0 : x : y + fucomip %st(1),%st // x : y + je 31f + fxch // y : x +31: fstp %st(1) + ret + + .align ALIGNARG(4) +2: /* y is a real number. */ + fxch // x : y + fldl MO(one) // 1.0 : x : y + fld %st(1) // x : 1.0 : x : y + fsub %st(1) // x-1 : 1.0 : x : y + fabs // |x-1| : 1.0 : x : y + fcompl MO(limit) // 1.0 : x : y + fnstsw + fxch // x : 1.0 : y + test $4500,%eax + jz 7f + fsub %st(1) // x-1 : 1.0 : y + fyl2xp1 // log2(x) : y + jmp 8f + +7: fyl2x // log2(x) : y +8: fmul %st(1) // y*log2(x) : y + fxam + fnstsw + andb $0x45, %ah + cmpb $0x05, %ah // is y*log2(x) == ±inf ? 
+ je 28f + fst %st(1) // y*log2(x) : y*log2(x) + frndint // int(y*log2(x)) : y*log2(x) + fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) + fxch // fract(y*log2(x)) : int(y*log2(x)) + f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x)) + faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x)) + fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x)) + fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x)) + ret + +28: fstp %st(1) // y*log2(x) + fldl MO(one) // 1 : y*log2(x) + fscale // 2^(y*log2(x)) : y*log2(x) + fstp %st(1) // 2^(y*log2(x)) + ret + + // pow(x,±0) = 1 + .align ALIGNARG(4) +11: fstp %st(0) // pop y + fldl MO(one) + ret + + // y == ±inf + .align ALIGNARG(4) +12: fstp %st(0) // pop y + fldt 8(%rsp) // x + fabs + fcompl MO(one) // < 1, == 1, or > 1 + fnstsw + andb $0x45, %ah + cmpb $0x45, %ah + je 13f // jump if x is NaN + + cmpb $0x40, %ah + je 14f // jump if |x| == 1 + + shlb $1, %ah + xorb %ah, %dl + andl $2, %edx +#ifdef PIC + lea inf_zero(%rip),%rcx + fldl (%rcx, %rdx, 4) +#else + fldl inf_zero(,%rdx, 4) +#endif + ret + + .align ALIGNARG(4) +14: fldl MO(one) + ret + + .align ALIGNARG(4) +13: fldt 8(%rsp) // load x == NaN + ret + + .align ALIGNARG(4) + // x is ±inf +15: fstp %st(0) // y + testb $2, %dh + jz 16f // jump if x == +inf + + // We must find out whether y is an odd integer. + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 17f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp), %eax + mov -4(%rsp), %edx + andb $1, %al + jz 18f // jump if not odd + // It's an odd integer. + shrl $31, %edx +#ifdef PIC + lea minf_mzero(%rip),%rcx + fldl (%rcx, %rdx, 8) +#else + fldl minf_mzero(,%rdx, 8) +#endif + ret + + .align ALIGNARG(4) +16: fcompl MO(zero) + fnstsw + shrl $5, %eax + andl $8, %eax +#ifdef PIC + lea inf_zero(%rip),%rcx + fldl (%rcx, %rax, 1) +#else + fldl inf_zero(,%rax, 1) +#endif + ret + + .align ALIGNARG(4) +17: shll $30, %edx // sign bit for y in right position +18: shrl $31, %edx +#ifdef PIC + lea inf_zero(%rip),%rcx + fldl (%rcx, %rdx, 8) +#else + fldl inf_zero(,%rdx, 8) +#endif + ret + + .align ALIGNARG(4) + // x is ±0 +20: fstp %st(0) // y + testb $2, %dl + jz 21f // y > 0 + + // x is ±0 and y is < 0. We must find out whether y is an odd integer. + testb $2, %dh + jz 25f + + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 26f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp),%eax + mov -4(%rsp),%edx + andb $1, %al + jz 27f // jump if not odd + // It's an odd integer. + // Raise divide-by-zero exception and get minus infinity value. + fldl MO(one) + fdivl MO(zero) + fchs + ret + +25: fstp %st(0) +26: +27: // Raise divide-by-zero exception and get infinity value. + fldl MO(one) + fdivl MO(zero) + ret + + .align ALIGNARG(4) + // x is ±0 and y is > 0. We must find out whether y is an odd integer. +21: testb $2, %dh + jz 22f + + fld %st // y : y + fistpll -8(%rsp) // y + fildll -8(%rsp) // int(y) : y + fucomip %st(1),%st + ffreep %st // <empty> + jne 23f + + // OK, the value is an integer, but is it odd? + mov -8(%rsp),%eax + mov -4(%rsp),%edx + andb $1, %al + jz 24f // jump if not odd + // It's an odd integer. 
+ fldl MO(mzero) + ret + +22: fstp %st(0) +23: +24: fldl MO(zero) + ret + +/*END(__ieee754_powl)*/ +END(RT_NOCRT(powl)) + diff --git a/src/recompiler/Sun/e_powl-x86.S b/src/recompiler/Sun/e_powl-x86.S new file mode 100644 index 00000000..cbc99256 --- /dev/null +++ b/src/recompiler/Sun/e_powl-x86.S @@ -0,0 +1,413 @@ +/* ix87 specific implementation of pow function. + Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005 + Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +/*#include <machine/asm.h>*/ +#include <iprt/cdefs.h> + +#ifdef __MINGW32__ +# define ASM_TYPE_DIRECTIVE(name,typearg) +# define ASM_SIZE_DIRECTIVE(name) +# define cfi_adjust_cfa_offset(a) +# define C_LABEL(name) _ ## name: +# define C_SYMBOL_NAME(name) _ ## name +# define ASM_GLOBAL_DIRECTIVE .global +# define ALIGNARG(log2) 1<<log2 +#elif __APPLE__ +# define ASM_TYPE_DIRECTIVE(name,typearg) +# define ASM_SIZE_DIRECTIVE(name) +# define cfi_adjust_cfa_offset(a) +# define C_LABEL(name) _ ## name: +# define C_SYMBOL_NAME(name) _ ## name +# define ASM_GLOBAL_DIRECTIVE .globl +# define ALIGNARG(log2) log2 +#else +# define ASM_TYPE_DIRECTIVE(name,typearg) .type name,typearg; +# define ASM_SIZE_DIRECTIVE(name) .size name,.-name; +# define C_LABEL(name) name: +# define C_SYMBOL_NAME(name) name +# /* figure this one out. 
*/ +# define cfi_adjust_cfa_offset(a) +# define ASM_GLOBAL_DIRECTIVE .global +# define ALIGNARG(log2) 1<<log2 +#endif + +#define ENTRY(name) \ + ASM_GLOBAL_DIRECTIVE C_SYMBOL_NAME(name); \ + ASM_TYPE_DIRECTIVE (C_SYMBOL_NAME(name),@function) \ + .align ALIGNARG(4); \ + C_LABEL(name) + +#undef END +#define END(name) \ + ASM_SIZE_DIRECTIVE(name) + +#ifdef __ELF__ + .section .rodata +#else + .text +#endif + + .align ALIGNARG(4) + ASM_TYPE_DIRECTIVE(infinity,@object) +inf_zero: +infinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f + ASM_SIZE_DIRECTIVE(infinity) + ASM_TYPE_DIRECTIVE(zero,@object) +zero: .double 0.0 + ASM_SIZE_DIRECTIVE(zero) + ASM_TYPE_DIRECTIVE(minf_mzero,@object) +minf_mzero: +minfinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff +mzero: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + ASM_SIZE_DIRECTIVE(minf_mzero) + ASM_TYPE_DIRECTIVE(one,@object) +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + ASM_TYPE_DIRECTIVE(limit,@object) +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p63,@object) +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%ecx) +#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +#else +#define MO(op) op +#define MOX(op,x,f) op(,x,f) +#endif + + .text +//ENTRY(__ieee754_powl) +ENTRY(RT_NOCRT(powl)) +#ifdef RT_OS_DARWIN /* 16-byte long double with 8 byte alignment requirements */ + fldt 20(%esp) // y +#else + fldt 16(%esp) // y +#endif + fxam + +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + + fnstsw + movb %ah, %dl + andb $0x45, %ah + cmpb $0x40, %ah // is y == 0 ? + je .L11 + + cmpb $0x05, %ah // is y == ±inf ? + je .L12 + + cmpb $0x01, %ah // is y == NaN ? + je .L30 + + fldt 4(%esp) // x : y + + subl $8,%esp + cfi_adjust_cfa_offset (8) + + fxam + fnstsw + movb %ah, %dh + andb $0x45, %ah + cmpb $0x40, %ah + je .L20 // x is ±0 + + cmpb $0x05, %ah + je .L15 // x is ±inf + + fxch // y : x + + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p63) // y : x + fnstsw + sahf + jnc .L2 + + /* First see whether `y' is a natural number. In this case we + can use a more precise algorithm. */ + fld %st // y : y : x + fistpll (%esp) // y : x + fildll (%esp) // int(y) : y : x + fucomp %st(1) // y : x + fnstsw + sahf + jne .L2 + + /* OK, we have an integer value for y. */ + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + orl $0, %edx + fstp %st(0) // x + jns .L4 // y >= 0, jump + fdivrl MO(one) // 1/x (now referred to as x) + negl %eax + adcl $0, %edx + negl %edx +.L4: fldl MO(one) // 1 : x + fxch + +.L6: shrdl $1, %edx, %eax + jnc .L5 + fxch + fmul %st(1) // x : ST*x + fxch +.L5: fmul %st(0), %st // x*x : ST*x + shrl $1, %edx + movl %eax, %ecx + orl %edx, %ecx + jnz .L6 + fstp %st(0) // ST*x + ret + + /* y is ±NAN */ +.L30: fldt 4(%esp) // x : y + fldl MO(one) // 1.0 : x : y + fucomp %st(1) // x : y + fnstsw + sahf + je .L31 + fxch // y : x +.L31: fstp %st(1) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +.L2: /* y is a real number. */ + fxch // x : y + fldl MO(one) // 1.0 : x : y + fld %st(1) // x : 1.0 : x : y + fsub %st(1) // x-1 : 1.0 : x : y + fabs // |x-1| : 1.0 : x : y + fcompl MO(limit) // 1.0 : x : y + fnstsw + fxch // x : 1.0 : y + sahf + ja .L7 + fsub %st(1) // x-1 : 1.0 : y + fyl2xp1 // log2(x) : y + jmp .L8 + +.L7: fyl2x // log2(x) : y +.L8: fmul %st(1) // y*log2(x) : y + fxam + fnstsw + andb $0x45, %ah + cmpb $0x05, %ah // is y*log2(x) == ±inf ? 
+ je .L28 + fst %st(1) // y*log2(x) : y*log2(x) + frndint // int(y*log2(x)) : y*log2(x) + fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) + fxch // fract(y*log2(x)) : int(y*log2(x)) + f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x)) + faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x)) + fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x)) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x)) + ret + + cfi_adjust_cfa_offset (8) +.L28: fstp %st(1) // y*log2(x) + fldl MO(one) // 1 : y*log2(x) + fscale // 2^(y*log2(x)) : y*log2(x) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fstp %st(1) // 2^(y*log2(x)) + ret + + // pow(x,±0) = 1 + .align ALIGNARG(4) +.L11: fstp %st(0) // pop y + fldl MO(one) + ret + + // y == ±inf + .align ALIGNARG(4) +.L12: fstp %st(0) // pop y + fldt 4(%esp) // x + fabs + fcompl MO(one) // < 1, == 1, or > 1 + fnstsw + andb $0x45, %ah + cmpb $0x45, %ah + je .L13 // jump if x is NaN + + cmpb $0x40, %ah + je .L14 // jump if |x| == 1 + + shlb $1, %ah + xorb %ah, %dl + andl $2, %edx + fldl MOX(inf_zero, %edx, 4) + ret + + .align ALIGNARG(4) +.L14: fldl MO(one) + ret + + .align ALIGNARG(4) +.L13: fldt 4(%esp) // load x == NaN + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±inf +.L15: fstp %st(0) // y + testb $2, %dh + jz .L16 // jump if x == +inf + + // We must find out whether y is an odd integer. + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne .L17 + + // OK, the value is an integer, but is it odd? + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz .L18 // jump if not odd + // It's an odd integer. + shrl $31, %edx + fldl MOX(minf_mzero, %edx, 8) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +.L16: fcompl MO(zero) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fnstsw + shrl $5, %eax + andl $8, %eax + fldl MOX(inf_zero, %eax, 1) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +.L17: shll $30, %edx // sign bit for y in right position + addl $8, %esp + cfi_adjust_cfa_offset (-8) +.L18: shrl $31, %edx + fldl MOX(inf_zero, %edx, 8) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±0 +.L20: fstp %st(0) // y + testb $2, %dl + jz .L21 // y > 0 + + // x is ±0 and y is < 0. We must find out whether y is an odd integer. + testb $2, %dh + jz .L25 + + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne .L26 + + // OK, the value is an integer, but is it odd? + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz .L27 // jump if not odd + // It's an odd integer. + // Raise divide-by-zero exception and get minus infinity value. + fldl MO(one) + fdivl MO(zero) + fchs + ret + + cfi_adjust_cfa_offset (8) +.L25: fstp %st(0) +.L26: addl $8, %esp + cfi_adjust_cfa_offset (-8) +.L27: // Raise divide-by-zero exception and get infinity value. + fldl MO(one) + fdivl MO(zero) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±0 and y is > 0. We must find out whether y is an odd integer. +.L21: testb $2, %dh + jz .L22 + + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne .L23 + + // OK, the value is an integer, but is it odd? + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz .L24 // jump if not odd + // It's an odd integer. 
+ fldl MO(mzero) + ret + + cfi_adjust_cfa_offset (8) +.L22: fstp %st(0) +.L23: addl $8, %esp // Don't use 2 x pop + cfi_adjust_cfa_offset (-8) +.L24: fldl MO(zero) + ret + +END(RT_NOCRT(powl)) +//END(__ieee754_powl) diff --git a/src/recompiler/Sun/kvm.h b/src/recompiler/Sun/kvm.h new file mode 100644 index 00000000..0403e3e4 --- /dev/null +++ b/src/recompiler/Sun/kvm.h @@ -0,0 +1,28 @@ +/* $Id: kvm.h $ */ +/** @file + * VBox Recompiler - kvm stub header. + */ + +/* + * Copyright (C) 2011-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + +#ifndef ___kvm_stub_h___ +#define ___kvm_stub_h___ + +#define kvm_enabled() false +#define kvm_update_guest_debug(a, b) AssertFailed() +#define kvm_set_phys_mem(a, b, c) AssertFailed() +#define kvm_arch_get_registers(a) AssertFailed() +#define cpu_synchronize_state(a) do { } while (0) + +#endif + diff --git a/src/recompiler/Sun/testmath.c b/src/recompiler/Sun/testmath.c new file mode 100644 index 00000000..b1650753 --- /dev/null +++ b/src/recompiler/Sun/testmath.c @@ -0,0 +1,828 @@ +/* $Id: testmath.c $ */ +/** @file + * Testcase for the no-crt math stuff. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#ifndef MATHTEST_STANDALONE +# include <iprt/assert.h> +# include <math.h> +# undef printf +# define printf RTAssertMsg2Weak +#else +# include <stdio.h> +# include <math.h> +#endif + +/* gcc starting with version 4.3 uses the MPFR library which results in more accurate results. gcc-4.3.1 seems to emit the less accurate result. So just allow both results. 
*/ +#define SIN180a -0.8011526357338304777463731115L +#define SIN180b -0.801152635733830477871L + +static void bitch(const char *pszWhat, const long double *plrdResult, const long double *plrdExpected) +{ + const unsigned char *pach1 = (const unsigned char *)plrdResult; + const unsigned char *pach2 = (const unsigned char *)plrdExpected; +#ifndef MATHTEST_STANDALONE + printf("error: %s - %d instead of %d\n", pszWhat, (int)(*plrdResult * 100000), (int)(*plrdExpected * 100000)); +#else + printf("error: %s - %.25f instead of %.25f\n", pszWhat, (double)*plrdResult, (double)*plrdExpected); +#endif + printf(" %02x%02x%02x%02x-%02x%02x%02x%02x-%02x%02x\n", pach1[0], pach1[1], pach1[2], pach1[3], pach1[4], pach1[5], pach1[6], pach1[7], pach1[8], pach1[9]); + printf(" %02x%02x%02x%02x-%02x%02x%02x%02x-%02x%02x\n", pach2[0], pach2[1], pach2[2], pach2[3], pach2[4], pach2[5], pach2[6], pach2[7], pach2[8], pach2[9]); +} + +static void bitchll(const char *pszWhat, long long llResult, long long llExpected) +{ +#if defined(__MINGW32__) && !defined(Assert) + printf("error: %s - %I64d instead of %I64d\n", pszWhat, llResult, llExpected); +#else + printf("error: %s - %lld instead of %lld\n", pszWhat, llResult, llExpected); +#endif +} + +static void bitchl(const char *pszWhat, long lResult, long lExpected) +{ + printf("error: %s - %ld instead of %ld\n", pszWhat, lResult, lExpected); +} + +extern int testsin(void) +{ + return sinl(180.0L) == SIN180a || sinl(180.0L) == SIN180b; +} + +extern int testremainder(void) +{ + static double s_rd1 = 2.5; + static double s_rd2 = 2.0; + static double s_rd3 = 0.5; + return remainder(s_rd1, s_rd2) == s_rd3; +} + +static __inline__ void set_cw(unsigned cw) +{ + __asm __volatile("fldcw %0" : : "m" (cw)); +} + +static __inline__ unsigned get_cw(void) +{ + unsigned cw; + __asm __volatile("fstcw %0" : "=m" (cw)); + return cw & 0xffff; +} + +static long double check_lrd(const long double lrd, const unsigned long long ull, const unsigned short us) +{ + static volatile long double lrd2; + lrd2 = lrd; + if ( *(unsigned long long *)&lrd2 != ull + || ((unsigned short *)&lrd2)[4] != us) + { +#if defined(__MINGW32__) && !defined(Assert) + printf("%I64x:%04x instead of %I64x:%04x\n", *(unsigned long long *)&lrd2, ((unsigned short *)&lrd2)[4], ull, us); +#else + printf("%llx:%04x instead of %llx:%04x\n", *(unsigned long long *)&lrd2, ((unsigned short *)&lrd2)[4], ull, us); +#endif + __asm__("int $3\n"); + } + return lrd; +} + + +static long double make_lrd(const unsigned long long ull, const unsigned short us) +{ + union + { + long double lrd; + struct + { + unsigned long long ull; + unsigned short us; + } i; + } u; + + u.i.ull = ull; + u.i.us = us; + return u.lrd; +} + +static long double check_lrd_cw(const long double lrd, const unsigned long long ull, const unsigned short us, const unsigned cw) +{ + set_cw(cw); + if (cw != get_cw()) + { + printf("get_cw() -> %#x expected %#x\n", get_cw(), cw); + __asm__("int $3\n"); + } + return check_lrd(lrd, ull, us); +} + +static long double make_lrd_cw(unsigned long long ull, unsigned short us, unsigned cw) +{ + set_cw(cw); + return check_lrd_cw(make_lrd(ull, us), ull, us, cw); +} + +extern int testmath(void) +{ + unsigned cErrors = 0; + long double lrdResult; + long double lrdExpect; + long double lrd; +#define CHECK(operation, expect) \ + do { \ + lrdExpect = expect; \ + lrdResult = operation; \ + if (lrdResult != lrdExpect) \ + { \ + bitch(#operation, &lrdResult, &lrdExpect); \ + cErrors++; \ + } \ + } while (0) + + long long llResult; + long long 
llExpect; +#define CHECKLL(operation, expect) \ + do { \ + llExpect = expect; \ + llResult = operation; \ + if (llResult != llExpect) \ + { \ + bitchll(#operation, llResult, llExpect); \ + cErrors++; \ + } \ + } while (0) + + long lResult; + long lExpect; +#define CHECKL(operation, expect) \ + do { \ + lExpect = expect; \ + lResult = operation; \ + if (lResult != lExpect) \ + { \ + bitchl(#operation, lResult, lExpect); \ + cErrors++; \ + } \ + } while (0) + + CHECK(atan2l(1.0L, 1.0L), 0.785398163397448309603L); + CHECK(atan2l(2.3L, 3.3L), 0.608689307327411694890L); + + CHECK(ceill(1.9L), 2.0L); + CHECK(ceill(4.5L), 5.0L); + CHECK(ceill(3.3L), 4.0L); + CHECK(ceill(6.1L), 7.0L); + + CHECK(floorl(1.9L), 1.0L); + CHECK(floorl(4.5L), 4.0L); + CHECK(floorl(7.3L), 7.0L); + CHECK(floorl(1234.1L), 1234.0L); + CHECK(floor(1233.1), 1233.0); + CHECK(floor(1239.98989898), 1239.0); + CHECK(floorf(9999.999), 9999.0); + + CHECK(ldexpl(1.0L, 1), 2.0L); + CHECK(ldexpl(1.0L, 10), 1024.0L); + CHECK(ldexpl(2.25L, 10), 2304.0L); + + CHECKLL(llrintl(1.0L), 1); + CHECKLL(llrintl(1.3L), 1); + CHECKLL(llrintl(1.5L), 2); + CHECKLL(llrintl(1.9L), 2); + CHECKLL(llrintf(123.34), 123); + CHECKLL(llrintf(-123.50), -124); + CHECKLL(llrint(42.42), 42); + CHECKLL(llrint(-2147483648.12343), -2147483648LL); +#if !defined(RT_ARCH_AMD64) + CHECKLL(lrint(-21474836499.12343), -2147483648LL); + CHECKLL(lrint(-2147483649932412.12343), -2147483648LL); +#else + CHECKLL(lrint(-21474836499.12343), -21474836499L); + CHECKLL(lrint(-2147483649932412.12343), -2147483649932412L); +#endif + +// __asm__("int $3"); + CHECKL(lrintl(make_lrd_cw(000000000000000000ULL,000000,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x067f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x067f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x0a7f)), 1L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x0a7f)), 1L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x3ffe,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x067f)), -1L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x067f)), -1L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x0a7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x0a7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0xbffe,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x067f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x067f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x0a7f)), 1L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x0a7f)), 1L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x9249249249249000ULL,0x3ffc,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x027f)), 0L); + CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x027f)), 0L); + 
CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x067f)), -1L); + CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x067f)), -1L); + CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x0a7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x0a7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0xe38e38e38e38e000ULL,0xbffb,0x0e7f)), 0L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x027f)), 32768L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x027f)), 32768L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x067f)), 32768L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x067f)), 32768L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x0a7f)), 32768L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x0a7f)), 32768L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x0e7f)), 32768L); + CHECKL(lrintl(make_lrd_cw(0x8000000000000000ULL,0x400e,0x0e7f)), 32768L); +#if !defined(RT_ARCH_AMD64) + /* c90 says that the constant is 2147483648 (which is not representable as a signed 32-bit + * value). To that constant you've then applied the negation operation. c90 doesn't have + * negative constants, only positive ones that have been negated. */ + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x027f)), (long)(-2147483647L - 1)); + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x027f)), (long)(-2147483647L - 1)); + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x067f)), (long)(-2147483647L - 1)); + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x067f)), (long)(-2147483647L - 1)); + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x0a7f)), (long)(-2147483647L - 1)); + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x0a7f)), (long)(-2147483647L - 1)); + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x0e7f)), (long)(-2147483647L - 1)); + CHECKL(lrintl(make_lrd_cw(0xad78ebc5ac620000ULL,0xc041,0x0e7f)), (long)(-2147483647L - 1)); +#endif + set_cw(0x27f); + + CHECK(logl(2.7182818284590452353602874713526625L), 1.0); + + CHECK(remainderl(1.0L, 1.0L), 0.0); + CHECK(remainderl(1.0L, 1.5L), -0.5); + CHECK(remainderl(42.0L, 34.25L), 7.75); + CHECK(remainderf(43.0, 34.25), 8.75); + CHECK(remainder(44.25, 34.25), 10.00); + double rd1 = 44.25; + double rd2 = 34.25; + CHECK(remainder(rd1, rd2), 10.00); + CHECK(remainder(2.5, 2.0), 0.5); + CHECK(remainder(2.5, 2.0), 0.5); + CHECK(remainder(2.5, 2.0), 0.5); + CHECKLL(testremainder(), 1); + + + /* Only works in extended precision, while double precision is default on BSD (including Darwin) */ + set_cw(0x37f); + CHECK(rintl(1.0L), 1.0); + CHECK(rintl(1.4L), 1.0); + CHECK(rintl(1.3L), 1.0); + CHECK(rintl(0.9L), 1.0); + CHECK(rintl(3123.1232L), 3123.0); + CHECK(rint(3985.13454), 3985.0); + CHECK(rintf(9999.999), 10000.0); + set_cw(0x27f); + + CHECK(sinl(1.0L), 0.84147098480789650664L); +#if 0 + lrd = 180.0L; + CHECK(sinl(lrd), -0.801152635733830477871L); +#else + lrd = 180.0L; + lrdExpect = SIN180a; + lrdResult = sinl(lrd); + if (lrdResult != lrdExpect) + { + lrdExpect = SIN180b; + if (lrdResult != lrdExpect) + { + bitch("sinl(lrd)", &lrdResult, &lrdExpect); + cErrors++; + } + } +#endif +#if 0 + CHECK(sinl(180.0L), SIN180); +#else + lrdExpect = SIN180a; + lrdResult = sinl(180.0L); + if (lrdResult != lrdExpect) + { + lrdExpect = SIN180b; + if (lrdResult != lrdExpect) + { + bitch("sinl(180.0L)", &lrdResult, &lrdExpect); + cErrors++; + } + } +#endif + 
CHECKLL(testsin(), 1); + + CHECK(sqrtl(1.0L), 1.0); + CHECK(sqrtl(4.0L), 2.0); + CHECK(sqrtl(1525225.0L), 1235.0); + + CHECK(tanl(0.0L), 0.0); + CHECK(tanl(0.7853981633974483096156608458198757L), 1.0); + + CHECK(powl(0.0, 0.0), 1.0); + CHECK(powl(2.0, 2.0), 4.0); + CHECK(powl(3.0, 3.0), 27.0); + + return cErrors; +} + + +///////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// +#if 0 + +#define floatx_to_int32 floatx80_to_int32 +#define floatx_to_int64 floatx80_to_int64 +#define floatx_to_int32_round_to_zero floatx80_to_int32_round_to_zero +#define floatx_to_int64_round_to_zero floatx80_to_int64_round_to_zero +#define floatx_abs floatx80_abs +#define floatx_chs floatx80_chs +#define floatx_round_to_int(foo, bar) floatx80_round_to_int(foo, NULL) +#define floatx_compare floatx80_compare +#define floatx_compare_quiet floatx80_compare_quiet +#undef sin +#undef cos +#undef sqrt +#undef pow +#undef log +#undef tan +#undef atan2 +#undef floor +#undef ceil +#undef ldexp +#define sin sinl +#define cos cosl +#define sqrt sqrtl +#define pow powl +#define log logl +#define tan tanl +#define atan2 atan2l +#define floor floorl +#define ceil ceill +#define ldexp ldexpl + + +typedef long double CPU86_LDouble; + +typedef union { + long double d; + struct { + unsigned long long lower; + unsigned short upper; + } l; +} CPU86_LDoubleU; + +/* the following deal with x86 long double-precision numbers */ +#define MAXEXPD 0x7fff +#define EXPBIAS 16383 +#define EXPD(fp) (fp.l.upper & 0x7fff) +#define SIGND(fp) ((fp.l.upper) & 0x8000) +#define MANTD(fp) (fp.l.lower) +#define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS + +typedef long double floatx80; +#define STATUS_PARAM , void *pv + +static floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM) +{ + return rintl(a); +} + + + +struct myenv +{ + unsigned int fpstt; /* top of stack index */ + unsigned int fpus; + unsigned int fpuc; + unsigned char fptags[8]; /* 0 = valid, 1 = empty */ + union { +#ifdef USE_X86LDOUBLE + CPU86_LDouble d __attribute__((aligned(16))); +#else + CPU86_LDouble d; +#endif + } fpregs[8]; + +} my_env, env_org, env_res, *env = &my_env; + + +#define ST0 (env->fpregs[env->fpstt].d) +#define ST(n) (env->fpregs[(env->fpstt + (n)) & 7].d) +#define ST1 ST(1) +#define MAXTAN 9223372036854775808.0 + + +static inline void fpush(void) +{ + env->fpstt = (env->fpstt - 1) & 7; + env->fptags[env->fpstt] = 0; /* validate stack entry */ +} + +static inline void fpop(void) +{ + env->fptags[env->fpstt] = 1; /* invalidate stack entry */ + env->fpstt = (env->fpstt + 1) & 7; +} + +static void helper_f2xm1(void) +{ + ST0 = pow(2.0,ST0) - 1.0; +} + +static void helper_fyl2x(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if (fptemp>0.0){ + fptemp = log(fptemp)/log(2.0); /* log2(ST) */ + ST1 *= fptemp; + fpop(); + } else { + env->fpus &= (~0x4700); + env->fpus |= 0x400; + } +} + +static void helper_fptan(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = tan(fptemp); + fpush(); + ST0 = 1.0; + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg| < 2**52 only */ + } +} + +static void helper_fpatan(void) +{ + CPU86_LDouble fptemp, fpsrcop; + + fpsrcop = ST1; + fptemp = ST0; + ST1 = atan2(fpsrcop,fptemp); + fpop(); +} + +static 
void helper_fxtract(void) +{ + CPU86_LDoubleU temp; + unsigned int expdif; + + temp.d = ST0; + expdif = EXPD(temp) - EXPBIAS; + /*DP exponent bias*/ + ST0 = expdif; + fpush(); + BIASEXPONENT(temp); + ST0 = temp.d; +} + +static void helper_fprem1(void) +{ + CPU86_LDouble dblq, fpsrcop, fptemp; + CPU86_LDoubleU fpsrcop1, fptemp1; + int expdif; + int q; + + fpsrcop = ST0; + fptemp = ST1; + fpsrcop1.d = fpsrcop; + fptemp1.d = fptemp; + expdif = EXPD(fpsrcop1) - EXPD(fptemp1); + if (expdif < 53) { + dblq = fpsrcop / fptemp; + dblq = (dblq < 0.0)? ceil(dblq): floor(dblq); + ST0 = fpsrcop - fptemp*dblq; + q = (int)dblq; /* cutting off top bits is assumed here */ + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + /* (C0,C1,C3) <-- (q2,q1,q0) */ + env->fpus |= (q&0x4) << 6; /* (C0) <-- q2 */ + env->fpus |= (q&0x2) << 8; /* (C1) <-- q1 */ + env->fpus |= (q&0x1) << 14; /* (C3) <-- q0 */ + } else { + env->fpus |= 0x400; /* C2 <-- 1 */ + fptemp = pow(2.0, expdif-50); + fpsrcop = (ST0 / ST1) / fptemp; + /* fpsrcop = integer obtained by rounding to the nearest */ + fpsrcop = (fpsrcop-floor(fpsrcop) < ceil(fpsrcop)-fpsrcop)? + floor(fpsrcop): ceil(fpsrcop); + ST0 -= (ST1 * fpsrcop * fptemp); + } +} + +static void helper_fprem(void) +{ +#if 0 +LogFlow(("helper_fprem: ST0=%.*Rhxs ST1=%.*Rhxs fpus=%#x\n", sizeof(ST0), &ST0, sizeof(ST1), &ST1, env->fpus)); + + __asm__ __volatile__("fldt (%2)\n" + "fldt (%1)\n" + "fprem \n" + "fnstsw (%0)\n" + "fstpt (%1)\n" + "fstpt (%2)\n" + : : "r" (&env->fpus), "r" (&ST0), "r" (&ST1) : "memory"); + +LogFlow(("helper_fprem: -> ST0=%.*Rhxs fpus=%#x c\n", sizeof(ST0), &ST0, env->fpus)); +#else + CPU86_LDouble dblq, fpsrcop, fptemp; + CPU86_LDoubleU fpsrcop1, fptemp1; + int expdif; + int q; + + fpsrcop = ST0; + fptemp = ST1; + fpsrcop1.d = fpsrcop; + fptemp1.d = fptemp; + + expdif = EXPD(fpsrcop1) - EXPD(fptemp1); + if ( expdif < 53 ) { + dblq = fpsrcop / fptemp; + dblq = (dblq < 0.0)? ceil(dblq): floor(dblq); + ST0 = fpsrcop - fptemp*dblq; + q = (int)dblq; /* cutting off top bits is assumed here */ + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + /* (C0,C1,C3) <-- (q2,q1,q0) */ + env->fpus |= (q&0x4) << 6; /* (C0) <-- q2 */ + env->fpus |= (q&0x2) << 8; /* (C1) <-- q1 */ + env->fpus |= (q&0x1) << 14; /* (C3) <-- q0 */ + } else { + env->fpus |= 0x400; /* C2 <-- 1 */ + fptemp = pow(2.0, expdif-50); + fpsrcop = (ST0 / ST1) / fptemp; + /* fpsrcop = integer obtained by chopping */ + fpsrcop = (fpsrcop < 0.0)? 
+ -(floor(fabs(fpsrcop))): floor(fpsrcop); + ST0 -= (ST1 * fpsrcop * fptemp); + } +#endif +} + +static void helper_fyl2xp1(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if ((fptemp+1.0)>0.0) { + fptemp = log(fptemp+1.0) / log(2.0); /* log2(ST+1.0) */ + ST1 *= fptemp; + fpop(); + } else { + env->fpus &= (~0x4700); + env->fpus |= 0x400; + } +} + +static void helper_fsqrt(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if (fptemp<0.0) { + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + env->fpus |= 0x400; + } + ST0 = sqrt(fptemp); +} + +static void helper_fsincos(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if ((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = sin(fptemp); + fpush(); + ST0 = cos(fptemp); + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg| < 2**63 only */ + } +} + +static void helper_frndint(void) +{ + ST0 = floatx_round_to_int(ST0, &env->fp_status); +} + +static void helper_fscale(void) +{ + ST0 = ldexp (ST0, (int)(ST1)); +} + +static void helper_fsin(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if ((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = sin(fptemp); + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg| < 2**53 only */ + } +} + +static void helper_fcos(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = cos(fptemp); + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg5 < 2**63 only */ + } +} + +static void helper_fxam_ST0(void) +{ + CPU86_LDoubleU temp; + int expdif; + + temp.d = ST0; + + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + if (SIGND(temp)) + env->fpus |= 0x200; /* C1 <-- 1 */ + + /* XXX: test fptags too */ + expdif = EXPD(temp); + if (expdif == MAXEXPD) { +#ifdef USE_X86LDOUBLE + if (MANTD(temp) == 0x8000000000000000ULL) +#else + if (MANTD(temp) == 0) +#endif + env->fpus |= 0x500 /*Infinity*/; + else + env->fpus |= 0x100 /*NaN*/; + } else if (expdif == 0) { + if (MANTD(temp) == 0) + env->fpus |= 0x4000 /*Zero*/; + else + env->fpus |= 0x4400 /*Denormal*/; + } else { + env->fpus |= 0x400; + } +} + + +void check_env(void) +{ + int i; + for (i = 0; i < 8; i++) + { + CPU86_LDoubleU my, res; + my.d = env->fpregs[i].d; + res.d = env_res.fpregs[i].d; + + if ( my.l.lower != res.l.lower + || my.l.upper != res.l.upper) + printf("register %i: %#018llx:%#06x\n" + " expected %#018llx:%#06x\n", + i, + my.l.lower, my.l.upper, + res.l.lower, res.l.upper); + } + for (i = 0; i < 8; i++) + if (env->fptags[i] != env_res.fptags[i]) + printf("tag %i: %d != %d\n", i, env->fptags[i], env_res.fptags[i]); + if (env->fpstt != env_res.fpstt) + printf("fpstt: %#06x != %#06x\n", env->fpstt, env_res.fpstt); + if (env->fpuc != env_res.fpuc) + printf("fpuc: %#06x != %#06x\n", env->fpuc, env_res.fpuc); + if (env->fpus != env_res.fpus) + printf("fpus: %#06x != %#06x\n", env->fpus, env_res.fpus); +} +#endif /* not used. 
*/ + +#if 0 /* insert this into helper.c */ +/* FPU helpers */ +CPU86_LDoubleU my_st[8]; +unsigned int my_fpstt; +unsigned int my_fpus; +unsigned int my_fpuc; +unsigned char my_fptags[8]; + +void hlp_fpu_enter(void) +{ + int i; + for (i = 0; i < 8; i++) + my_st[i].d = env->fpregs[i].d; + my_fpstt = env->fpstt; + my_fpus = env->fpus; + my_fpuc = env->fpuc; + memcpy(&my_fptags, &env->fptags, sizeof(my_fptags)); +} + +void hlp_fpu_leave(const char *psz) +{ + int i; + Log(("/*code*/ \n")); + for (i = 0; i < 8; i++) + Log(("/*code*/ *(unsigned long long *)&env_org.fpregs[%d] = %#018llxULL; ((unsigned short *)&env_org.fpregs[%d])[4] = %#06x; env_org.fptags[%d]=%d;\n", + i, my_st[i].l.lower, i, my_st[i].l.upper, i, my_fptags[i])); + Log(("/*code*/ env_org.fpstt=%#x;\n", my_fpstt)); + Log(("/*code*/ env_org.fpus=%#x;\n", my_fpus)); + Log(("/*code*/ env_org.fpuc=%#x;\n", my_fpuc)); + for (i = 0; i < 8; i++) + { + CPU86_LDoubleU u; + u.d = env->fpregs[i].d; + Log(("/*code*/ *(unsigned long long *)&env_res.fpregs[%d] = %#018llxULL; ((unsigned short *)&env_res.fpregs[%d])[4] = %#06x; env_res.fptags[%d]=%d;\n", + i, u.l.lower, i, u.l.upper, i, env->fptags[i])); + } + Log(("/*code*/ env_res.fpstt=%#x;\n", env->fpstt)); + Log(("/*code*/ env_res.fpus=%#x;\n", env->fpus)); + Log(("/*code*/ env_res.fpuc=%#x;\n", env->fpuc)); + + Log(("/*code*/ my_env = env_org;\n")); + Log(("/*code*/ %s();\n", psz)); + Log(("/*code*/ check_env();\n")); +} +#endif /* helper.c */ + +extern void testmath2(void ) +{ +#if 0 +#include "/tmp/code.h" +#endif +} + + +///////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////// + +#ifdef MATHTEST_STANDALONE + +void test_fops(double a, double b) +{ + printf("a=%f b=%f a+b=%f\n", a, b, a + b); + printf("a=%f b=%f a-b=%f\n", a, b, a - b); + printf("a=%f b=%f a*b=%f\n", a, b, a * b); + printf("a=%f b=%f a/b=%f\n", a, b, a / b); + printf("a=%f b=%f fmod(a, b)=%f\n", a, b, (double)fmod(a, b)); + printf("a=%f sqrt(a)=%f\n", a, (double)sqrtl(a)); + printf("a=%f sin(a)=%f\n", a, (double)sinl(a)); + printf("a=%f cos(a)=%f\n", a, (double)cos(a)); + printf("a=%f tan(a)=%f\n", a, (double)tanl(a)); + printf("a=%f log(a)=%f\n", a, (double)log(a)); + printf("a=%f exp(a)=%f\n", a, (double)exp(a)); + printf("a=%f b=%f atan2(a, b)=%f\n", a, b, atan2(a, b)); + /* just to test some op combining */ + printf("a=%f asin(sinl(a))=%f\n", a, (double)asin(sinl(a))); + printf("a=%f acos(cos(a))=%f\n", a, (double)acos(cos(a))); + printf("a=%f atan(tanl(a))=%f\n", a, (double)atan(tanl(a))); +} + +int main() +{ + unsigned cErrors = testmath(); + + testmath2(); + test_fops(2, 3); + test_fops(1.4, -5); + + printf("cErrors=%u\n", cErrors); + return cErrors; +} +#endif + diff --git a/src/recompiler/VBoxREM.def b/src/recompiler/VBoxREM.def new file mode 100644 index 00000000..96e28350 --- /dev/null +++ b/src/recompiler/VBoxREM.def @@ -0,0 +1,50 @@ +; $Id: VBoxREM.def $ +;; @file +; VBoxREM Definition File. +; + +; +; Copyright (C) 2006-2019 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org. 
This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +; + +LIBRARY VBoxREM.dll + +EXPORTS + REMR3Init + REMR3InitFinalize + REMR3Term + REMR3Reset + REMR3Step + REMR3BreakpointSet + REMR3BreakpointClear + REMR3Run + REMR3EmulateInstruction + REMR3State + REMR3StateBack + REMR3StateUpdate + REMR3A20Set + REMR3DisasEnableStepping + REMR3ReplayHandlerNotifications + REMR3NotifyPhysRamRegister + REMR3NotifyPhysRamDeregister + REMR3NotifyPhysRomRegister + REMR3NotifyHandlerPhysicalModify + REMR3NotifyHandlerPhysicalRegister + REMR3NotifyHandlerPhysicalDeregister + REMR3NotifyInterruptSet + REMR3NotifyInterruptClear + REMR3NotifyTimerPending + REMR3NotifyDmaPending + REMR3NotifyQueuePending + REMR3NotifyFF + REMR3NotifyCodePageChanged + REMR3IsPageAccessHandled + diff --git a/src/recompiler/VBoxREM.rc b/src/recompiler/VBoxREM.rc new file mode 100644 index 00000000..ccf23339 --- /dev/null +++ b/src/recompiler/VBoxREM.rc @@ -0,0 +1,51 @@ +/* $Id: VBoxREM.rc $ */ +/** @file + * VBoxREM - Resource file containing version info. + */ + +/* + * Copyright (C) 2015-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + +#include <windows.h> +#include <VBox/version.h> + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US + +VS_VERSION_INFO VERSIONINFO + FILEVERSION VBOX_RC_FILE_VERSION + PRODUCTVERSION VBOX_RC_PRODUCT_VERSION + FILEFLAGSMASK VS_FFI_FILEFLAGSMASK + FILEFLAGS VBOX_RC_FILE_FLAGS + FILEOS VBOX_RC_FILE_OS + FILETYPE VBOX_RC_TYPE_DLL + FILESUBTYPE VFT2_UNKNOWN +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904b0" // Lang=US English, CharSet=Unicode + BEGIN + VALUE "FileDescription", "VirtualBox Recompiler\0" + VALUE "InternalName", "VBoxREM\0" + VALUE "OriginalFilename", "VBoxREM.dll\0" + VALUE "CompanyName", VBOX_RC_COMPANY_NAME + VALUE "FileVersion", VBOX_RC_FILE_VERSION_STR + VALUE "LegalCopyright", VBOX_RC_LEGAL_COPYRIGHT + VALUE "ProductName", VBOX_RC_PRODUCT_NAME_STR + VALUE "ProductVersion", VBOX_RC_PRODUCT_VERSION_STR + VBOX_RC_MORE_STRINGS + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1200 + END +END diff --git a/src/recompiler/VBoxREMWrapper.cpp b/src/recompiler/VBoxREMWrapper.cpp new file mode 100644 index 00000000..4349bd9a --- /dev/null +++ b/src/recompiler/VBoxREMWrapper.cpp @@ -0,0 +1,2477 @@ +/* $Id: VBoxREMWrapper.cpp $ */ +/** @file + * + * VBoxREM Win64 DLL Wrapper. + */ +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. 
This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/** @page pg_vboxrem_amd64 VBoxREM Hacks on AMD64 + * + * There are problems with building BoxREM both on WIN64 and 64-bit linux. + * + * On linux binutils refuses to link shared objects without -fPIC compiled code + * (bitches about some fixup types). But when trying to build with -fPIC dyngen + * doesn't like the code anymore. Sweet. The current solution is to build the + * VBoxREM code as a relocatable module and use our ELF loader to load it. + * + * On WIN64 we're not aware of any GCC port which can emit code using the MSC + * calling convention. So, we're in for some real fun here. The choice is between + * porting GCC to AMD64 WIN64 and coming up with some kind of wrapper around + * either the win32 build or the 64-bit linux build. + * + * -# Porting GCC will be a lot of work. For one thing the calling convention differs + * and messing with such stuff can easily create ugly bugs. We would also have to + * do some binutils changes, but I think those are rather small compared to GCC. + * (That said, the MSC calling convention is far simpler than the linux one, it + * reminds me of _Optlink which we have working already.) + * -# Wrapping win32 code will work, but addresses outside the first 4GB are + * inaccessible and we will have to create 32-64 thunks for all imported functions. + * (To switch between 32-bit and 64-bit is load the right CS using far jmps (32->64) + * or far returns (both).) + * -# Wrapping 64-bit linux code might be the easier solution. The requirements here + * are: + * - Remove all CRT references we possibly, either by using intrinsics or using + * IPRT. Part of IPRT will be linked into VBoxREM2.rel, this will be yet another + * IPRT mode which I've dubbed 'no-crt'. The no-crt mode provide basic non-system + * dependent stuff. + * - Compile and link it into a relocatable object (include the gcc intrinsics + * in libgcc). Call this VBoxREM2.rel. + * - Write a wrapper dll, VBoxREM.dll, for which during REMR3Init() will load + * VBoxREM2.rel (using IPRT) and generate calling convention wrappers + * for all IPRT functions and VBoxVMM functions that it uses. All exports + * will be wrapped vice versa. + * - For building on windows hosts, we will use a mingw32 hosted cross compiler. + * and add a 'no-crt' mode to IPRT where it provides the necessary CRT headers + * and function implementations. + * + * The 3rd solution will be tried out first since it requires the least effort and + * will let us make use of the full 64-bit register set. + * + * + * + * @section sec_vboxrem_amd64_compare Comparing the GCC and MSC calling conventions + * + * GCC expects the following (cut & past from page 20 in the ABI draft 0.96): + * + * @verbatim + %rax temporary register; with variable arguments passes information about the + number of SSE registers used; 1st return register. + [Not preserved] + %rbx callee-saved register; optionally used as base pointer. + [Preserved] + %rcx used to pass 4th integer argument to functions. 
+ [Not preserved] + %rdx used to pass 3rd argument to functions; 2nd return register + [Not preserved] + %rsp stack pointer + [Preserved] + %rbp callee-saved register; optionally used as frame pointer + [Preserved] + %rsi used to pass 2nd argument to functions + [Not preserved] + %rdi used to pass 1st argument to functions + [Not preserved] + %r8 used to pass 5th argument to functions + [Not preserved] + %r9 used to pass 6th argument to functions + [Not preserved] + %r10 temporary register, used for passing a function's static chain + pointer [Not preserved] + %r11 temporary register + [Not preserved] + %r12-r15 callee-saved registers + [Preserved] + %xmm0-%xmm1 used to pass and return floating point arguments + [Not preserved] + %xmm2-%xmm7 used to pass floating point arguments + [Not preserved] + %xmm8-%xmm15 temporary registers + [Not preserved] + %mmx0-%mmx7 temporary registers + [Not preserved] + %st0 temporary register; used to return long double arguments + [Not preserved] + %st1 temporary registers; used to return long double arguments + [Not preserved] + %st2-%st7 temporary registers + [Not preserved] + %fs Reserved for system use (as thread specific data register) + [Not preserved] + @endverbatim + * + * Direction flag is preserved as cleared. + * The stack must be aligned on a 16-byte boundary before the 'call/jmp' instruction. + * + * + * + * MSC expects the following: + * @verbatim + rax return value, not preserved. + rbx preserved. + rcx 1st argument, integer, not preserved. + rdx 2nd argument, integer, not preserved. + rbp preserved. + rsp preserved. + rsi preserved. + rdi preserved. + r8 3rd argument, integer, not preserved. + r9 4th argument, integer, not preserved. + r10 scratch register, not preserved. + r11 scratch register, not preserved. + r12-r15 preserved. + xmm0 1st argument, fp, return value, not preserved. + xmm1 2st argument, fp, not preserved. + xmm2 3st argument, fp, not preserved. + xmm3 4st argument, fp, not preserved. + xmm4-xmm5 scratch, not preserved. + xmm6-xmm15 preserved. + @endverbatim + * + * Dunno what the direction flag is... + * The stack must be aligned on a 16-byte boundary before the 'call/jmp' instruction. + * + * + * Thus, When GCC code is calling MSC code we don't really have to preserve + * anything. But but MSC code is calling GCC code, we'll have to save esi and edi. + * + */ + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** @def USE_REM_STUBS + * Define USE_REM_STUBS to stub the entire REM stuff. This is useful during + * early porting (before we start running stuff). + */ +#if defined(DOXYGEN_RUNNING) +# define USE_REM_STUBS +#endif + +/** @def USE_REM_CALLING_CONVENTION_GLUE + * Define USE_REM_CALLING_CONVENTION_GLUE for platforms where it's necessary to + * use calling convention wrappers. + */ +#if (defined(RT_ARCH_AMD64) && defined(RT_OS_WINDOWS)) || defined(DOXYGEN_RUNNING) +# define USE_REM_CALLING_CONVENTION_GLUE +#endif + +/** @def USE_REM_IMPORT_JUMP_GLUE + * Define USE_REM_IMPORT_JUMP_GLUE for platforms where we need to + * emit some jump glue to deal with big addresses. 
+ */ +#if (defined(RT_ARCH_AMD64) && !defined(USE_REM_CALLING_CONVENTION_GLUE) && !defined(RT_OS_DARWIN)) || defined(DOXYGEN_RUNNING) +# define USE_REM_IMPORT_JUMP_GLUE +#endif + +/** @def VBOX_USE_BITNESS_SELECTOR + * Define VBOX_USE_BITNESS_SELECTOR to build this module as a bitness selector + * between VBoxREM32 and VBoxREM64. + */ +#if defined(DOXYGEN_RUNNING) +# define VBOX_USE_BITNESS_SELECTOR +#endif + +/** @def VBOX_WITHOUT_REM_LDR_CYCLE + * Define VBOX_WITHOUT_REM_LDR_CYCLE dynamically resolve any dependencies on + * VBoxVMM and thus avoid the cyclic dependency between VBoxREM and VBoxVMM. + */ +#if defined(DOXYGEN_RUNNING) +# define VBOX_WITHOUT_REM_LDR_CYCLE +#endif + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_REM +#include <VBox/vmm/rem.h> +#include <VBox/vmm/vmm.h> +#include <VBox/vmm/dbgf.h> +#include <VBox/dbg.h> +#include <VBox/vmm/csam.h> +#include <VBox/vmm/mm.h> +#include <VBox/vmm/em.h> +#include <VBox/vmm/ssm.h> +#include <VBox/vmm/hm.h> +#include <VBox/vmm/patm.h> +#include <VBox/vmm/pdm.h> +#include <VBox/vmm/pdmcritsect.h> +#include <VBox/vmm/pgm.h> +#include <VBox/vmm/iom.h> +#include <VBox/vmm/vm.h> +#include <VBox/err.h> +#include <VBox/log.h> +#include <VBox/dis.h> + +#include <iprt/alloc.h> +#include <iprt/assert.h> +#include <iprt/ldr.h> +#include <iprt/lockvalidator.h> +#include <iprt/param.h> +#include <iprt/path.h> +#include <iprt/string.h> +#include <iprt/stream.h> + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Parameter descriptor. + */ +typedef struct REMPARMDESC +{ + /** Parameter flags (REMPARMDESC_FLAGS_*). */ + uint8_t fFlags; + /** The parameter size if REMPARMDESC_FLAGS_SIZE is set. */ + uint8_t cb; + /** Pointer to additional data. + * For REMPARMDESC_FLAGS_PFN this is a PREMFNDESC. */ + void *pvExtra; + +} REMPARMDESC, *PREMPARMDESC; +/** Pointer to a constant parameter descriptor. */ +typedef const REMPARMDESC *PCREMPARMDESC; + +/** @name Parameter descriptor flags. + * @{ */ +/** The parameter type is a kind of integer which could fit in a register. This includes pointers. */ +#define REMPARMDESC_FLAGS_INT 0 +/** The parameter is a GC pointer. */ +#define REMPARMDESC_FLAGS_GCPTR 1 +/** The parameter is a GC physical address. */ +#define REMPARMDESC_FLAGS_GCPHYS 2 +/** The parameter is a HC physical address. */ +#define REMPARMDESC_FLAGS_HCPHYS 3 +/** The parameter type is a kind of floating point. */ +#define REMPARMDESC_FLAGS_FLOAT 4 +/** The parameter value is a struct. This type takes a size. */ +#define REMPARMDESC_FLAGS_STRUCT 5 +/** The parameter is an elipsis. */ +#define REMPARMDESC_FLAGS_ELLIPSIS 6 +/** The parameter is a va_list. */ +#define REMPARMDESC_FLAGS_VALIST 7 +/** The parameter is a function pointer. pvExtra is a PREMFNDESC. */ +#define REMPARMDESC_FLAGS_PFN 8 +/** The parameter type mask. */ +#define REMPARMDESC_FLAGS_TYPE_MASK 15 +/** The parameter size field is valid. */ +#define REMPARMDESC_FLAGS_SIZE RT_BIT(7) +/** @} */ + +/** + * Function descriptor. 
+ */ +typedef struct REMFNDESC +{ + /** The function name. */ + const char *pszName; + /** Exports: Pointer to the function pointer. + * Imports: Pointer to the function. */ + void *pv; + /** Array of parameter descriptors. */ + PCREMPARMDESC paParams; + /** The number of parameter descriptors pointed to by paParams. */ + uint8_t cParams; + /** Function flags (REMFNDESC_FLAGS_*). */ + uint8_t fFlags; + /** The size of the return value. */ + uint8_t cbReturn; + /** Pointer to the wrapper code for imports. */ + void *pvWrapper; +} REMFNDESC, *PREMFNDESC; +/** Pointer to a constant function descriptor. */ +typedef const REMFNDESC *PCREMFNDESC; + +/** @name Function descriptor flags. + * @{ */ +/** The return type is void. */ +#define REMFNDESC_FLAGS_RET_VOID 0 +/** The return type is a kind of integer passed in rax/eax. This includes pointers. */ +#define REMFNDESC_FLAGS_RET_INT 1 +/** The return type is a kind of floating point. */ +#define REMFNDESC_FLAGS_RET_FLOAT 2 +/** The return value is a struct. This type take a size. */ +#define REMFNDESC_FLAGS_RET_STRUCT 3 +/** The return type mask. */ +#define REMFNDESC_FLAGS_RET_TYPE_MASK 7 +/** The argument list contains one or more va_list arguments (i.e. problems). */ +#define REMFNDESC_FLAGS_VALIST RT_BIT(6) +/** The function has an ellipsis (i.e. a problem). */ +#define REMFNDESC_FLAGS_ELLIPSIS RT_BIT(7) +/** @} */ + +/** + * Chunk of read-write-executable memory. + */ +typedef struct REMEXECMEM +{ + /** The number of bytes left. */ + struct REMEXECMEM *pNext; + /** The size of this chunk. */ + uint32_t cb; + /** The offset of the next code block. */ + uint32_t off; +#if ARCH_BITS == 32 + uint32_t padding; +#endif +} REMEXECMEM, *PREMEXECMEM; + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +#ifndef USE_REM_STUBS +/** Loader handle of the REM object/DLL. */ +static RTLDRMOD g_ModREM2 = NIL_RTLDRMOD; +# ifndef VBOX_USE_BITNESS_SELECTOR +/** Pointer to the memory containing the loaded REM2 object/DLL. */ +static void *g_pvREM2 = NULL; +/** The size of the memory g_pvREM2 is pointing to. */ +static size_t g_cbREM2 = 0; +# endif +# ifdef VBOX_WITHOUT_REM_LDR_CYCLE +/** Loader handle of the VBoxVMM DLL. */ +static RTLDRMOD g_ModVMM = NIL_RTLDRMOD; +# endif + +/** Linux object export addresses. + * These are references from the assembly wrapper code. 
+ * @{ */ +static DECLCALLBACKPTR(int, pfnREMR3Init)(PVM); +static DECLCALLBACKPTR(int, pfnREMR3InitFinalize)(PVM); +static DECLCALLBACKPTR(int, pfnREMR3Term)(PVM); +static DECLCALLBACKPTR(void, pfnREMR3Reset)(PVM); +static DECLCALLBACKPTR(int, pfnREMR3Step)(PVM, PVMCPU); +static DECLCALLBACKPTR(int, pfnREMR3BreakpointSet)(PVM, RTGCUINTPTR); +static DECLCALLBACKPTR(int, pfnREMR3BreakpointClear)(PVM, RTGCUINTPTR); +static DECLCALLBACKPTR(int, pfnREMR3EmulateInstruction)(PVM, PVMCPU); +static DECLCALLBACKPTR(int, pfnREMR3Run)(PVM, PVMCPU); +static DECLCALLBACKPTR(int, pfnREMR3State)(PVM, PVMCPU); +static DECLCALLBACKPTR(int, pfnREMR3StateBack)(PVM, PVMCPU); +static DECLCALLBACKPTR(void, pfnREMR3StateUpdate)(PVM, PVMCPU); +static DECLCALLBACKPTR(void, pfnREMR3A20Set)(PVM, PVMCPU, bool); +static DECLCALLBACKPTR(void, pfnREMR3ReplayHandlerNotifications)(PVM pVM); +static DECLCALLBACKPTR(void, pfnREMR3NotifyPhysRamRegister)(PVM, RTGCPHYS, RTGCPHYS, unsigned); +static DECLCALLBACKPTR(void, pfnREMR3NotifyPhysRamDeregister)(PVM, RTGCPHYS, RTUINT); +static DECLCALLBACKPTR(void, pfnREMR3NotifyPhysRomRegister)(PVM, RTGCPHYS, RTUINT, void *, bool); +static DECLCALLBACKPTR(void, pfnREMR3NotifyHandlerPhysicalModify)(PVM, PGMPHYSHANDLERKIND, RTGCPHYS, RTGCPHYS, RTGCPHYS, bool, bool); +static DECLCALLBACKPTR(void, pfnREMR3NotifyHandlerPhysicalRegister)(PVM, PGMPHYSHANDLERKIND, RTGCPHYS, RTGCPHYS, bool); +static DECLCALLBACKPTR(void, pfnREMR3NotifyHandlerPhysicalDeregister)(PVM, PGMPHYSHANDLERKIND, RTGCPHYS, RTGCPHYS, bool, bool); +static DECLCALLBACKPTR(void, pfnREMR3NotifyInterruptSet)(PVM, PVMCPU); +static DECLCALLBACKPTR(void, pfnREMR3NotifyInterruptClear)(PVM, PVMCPU); +static DECLCALLBACKPTR(void, pfnREMR3NotifyTimerPending)(PVM, PVMCPU); +static DECLCALLBACKPTR(void, pfnREMR3NotifyDmaPending)(PVM); +static DECLCALLBACKPTR(void, pfnREMR3NotifyQueuePending)(PVM); +static DECLCALLBACKPTR(void, pfnREMR3NotifyFF)(PVM); +static DECLCALLBACKPTR(int, pfnREMR3NotifyCodePageChanged)(PVM, PVMCPU, RTGCPTR); +static DECLCALLBACKPTR(int, pfnREMR3DisasEnableStepping)(PVM, bool); +static DECLCALLBACKPTR(bool, pfnREMR3IsPageAccessHandled)(PVM, RTGCPHYS); +/** @} */ + +/** Export and import parameter descriptors. + * @{ + */ +/* Common args. 
*/ +static const REMPARMDESC g_aArgsSIZE_T[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; +static const REMPARMDESC g_aArgsPTR[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL } +}; +static const REMPARMDESC g_aArgsSIZE_TTag[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsPTRTag[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsPTR_SIZE_T[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; +static const REMPARMDESC g_aArgsSIZE_TTagLoc[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned int), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsPTRLoc[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned int), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsVM[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL } +}; +static const REMPARMDESC g_aArgsVMCPU[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL } +}; + +static const REMPARMDESC g_aArgsVMandVMCPU[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL } +}; + +/* REM args */ +static const REMPARMDESC g_aArgsBreakpoint[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCUINTPTR), NULL } +}; +static const REMPARMDESC g_aArgsA20Set[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; +static const REMPARMDESC g_aArgsNotifyPhysRamRegister[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL } +}; +static const REMPARMDESC g_aArgsNotifyPhysRamChunkRegister[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTUINT), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTHCUINTPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL } +}; +static const REMPARMDESC g_aArgsNotifyPhysRamDeregister[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTUINT), NULL } +}; +static const REMPARMDESC g_aArgsNotifyPhysRomRegister[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTUINT), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; +static const REMPARMDESC g_aArgsNotifyHandlerPhysicalModify[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMPHYSHANDLERKIND), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), 
NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; +static const REMPARMDESC g_aArgsNotifyHandlerPhysicalRegister[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMPHYSHANDLERKIND), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; +static const REMPARMDESC g_aArgsNotifyHandlerPhysicalDeregister[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMPHYSHANDLERKIND), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; +static const REMPARMDESC g_aArgsNotifyCodePageChanged[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCUINTPTR), NULL } +}; +static const REMPARMDESC g_aArgsNotifyPendingInterrupt[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t), NULL } +}; +static const REMPARMDESC g_aArgsDisasEnableStepping[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; +static const REMPARMDESC g_aArgsIsPageAccessHandled[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL } +}; + +# ifndef VBOX_USE_BITNESS_SELECTOR + +/* VMM args */ +static const REMPARMDESC g_aArgsAPICUpdatePendingInterrupts[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL } +}; +static const REMPARMDESC g_aArgsAPICGetTpr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t *), NULL } +}; +static const REMPARMDESC g_aArgsAPICSetTpr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t), NULL } +}; +static const REMPARMDESC g_aArgsCPUMGetGuestCpl[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, +}; + +/* CPUMQueryGuestMsr args */ +static const REMPARMDESC g_aArgsCPUMQueryGuestMsr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t *), NULL }, +}; + +/* CPUMSetGuestMsr args */ +static const REMPARMDESC g_aArgsCPUMSetGuestMsr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t), NULL }, +}; + +static const REMPARMDESC g_aArgsCPUMGetGuestCpuId[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL } +}; + +static const REMPARMDESC g_aArgsCPUMR3RemEnter[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL } +}; + +static const REMPARMDESC g_aArgsCPUMR3RemLeave[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { 
REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; + +static const REMPARMDESC g_aArgsCPUMSetChangedFlags[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; + +static const REMPARMDESC g_aArgsCPUMQueryGuestCtxPtr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL } +}; +static const REMPARMDESC g_aArgsCSAMR3MonitorPage[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTRCPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(CSAMTAG), NULL } +}; +static const REMPARMDESC g_aArgsCSAMR3UnmonitorPage[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTRCPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(CSAMTAG), NULL } +}; + +static const REMPARMDESC g_aArgsCSAMR3RecordCallAddress[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTRCPTR), NULL } +}; + +# if defined(VBOX_WITH_DEBUGGER) && !(defined(RT_OS_WINDOWS) && defined(RT_ARCH_AMD64)) /* the callbacks are problematic */ +static const REMPARMDESC g_aArgsDBGCRegisterCommands[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PCDBGCCMD), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL } +}; +# endif +static const REMPARMDESC g_aArgsDBGFR3DisasInstrEx[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PUVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(VMCPUID), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTSEL), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTGCPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL } +}; +static const REMPARMDESC g_aArgsDBGFR3DisasInstrCurrentLogInternal[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(char *), NULL } +}; +static const REMPARMDESC g_aArgsDBGFR3Info[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PUVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PCDBGFINFOHLP), NULL } +}; +static const REMPARMDESC g_aArgsDBGFR3AsSymbolByAddr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PUVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTDBGAS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PCDBGFADDRESS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(PRTGCINTPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PRTDBGSYMBOL), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PRTDBGMOD), NULL } +}; +static const REMPARMDESC g_aArgsDBGFR3AddrFromFlat[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PUVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PDBGFADDRESS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTGCUINTPTR), NULL } +}; +static const REMPARMDESC g_aArgsDISInstrToStr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t const *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(DISCPUMODE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PDISCPUSTATE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; +static const REMPARMDESC g_aArgsEMR3FatalError[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(int), NULL } +}; +static const REMPARMDESC g_aArgsEMSetInhibitInterruptsPC[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTGCPTR), NULL } +}; 
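/*
 * Illustrative sketch (not part of the VirtualBox sources): the argument
 * tables in this file pair one REMPARMDESC per parameter with one REMFNDESC
 * per imported or exported function, and the glue generators further down use
 * the per-parameter flags and sizes to marshal arguments between the GCC and
 * MSC calling conventions. For a hypothetical import with the prototype
 * "int SomeFn(PVM pVM, RTGCPHYS GCPhys, bool fEnable)" (SomeFn, g_aArgsSomeFn
 * and g_SomeFnDesc are invented names used only for this example), the
 * descriptors would look roughly like this:
 *
 *   static const REMPARMDESC g_aArgsSomeFn[] =
 *   {
 *       { REMPARMDESC_FLAGS_INT,    sizeof(PVM),      NULL },  // pointer passed as an integer-sized value
 *       { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL },  // guest physical address
 *       { REMPARMDESC_FLAGS_INT,    sizeof(bool),     NULL }
 *   };
 *   static REMFNDESC g_SomeFnDesc =
 *   {
 *       "SomeFn", (void *)(uintptr_t)&SomeFn, &g_aArgsSomeFn[0], RT_ELEMENTS(g_aArgsSomeFn),
 *       REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL
 *   };
 *
 * The last REMFNDESC member (pvWrapper) starts out NULL and is filled in by
 * the glue generator when a calling-convention wrapper is created for the
 * function.
 */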
+static const REMPARMDESC g_aArgsHMCanExecuteGuest[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PCPUMCTX), NULL }, +}; +static const REMPARMDESC g_aArgsIOMIOPortRead[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTIOPORT), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsIOMIOPortWrite[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTIOPORT), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsIOMMMIORead[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsIOMMMIOWrite[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsMMR3HeapAlloc[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(MMTAG), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsMMR3HeapAllocZ[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(MMTAG), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsPATMIsPatchGCAddr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTRCUINTPTR), NULL } +}; +static const REMPARMDESC g_aArgsPATMR3QueryOpcode[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTRCPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t *), NULL } +}; +static const REMPARMDESC g_aArgsPDMGetInterrupt[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t *), NULL } +}; +static const REMPARMDESC g_aArgsPDMIsaSetIrq[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsPDMR3CritSectInit[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PPDMCRITSECT), NULL }, + /* RT_SRC_POS_DECL */ + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned int), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(char *), NULL }, + { REMPARMDESC_FLAGS_ELLIPSIS, 0 } +}; +static const REMPARMDESC g_aArgsPDMCritSectEnter[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PPDMCRITSECT), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(int), NULL } +}; +static const REMPARMDESC g_aArgsPDMCritSectEnterDebug[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PPDMCRITSECT), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(int), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTHCUINTPTR), NULL }, + /* RT_SRC_POS_DECL */ + { REMPARMDESC_FLAGS_INT, 
sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsPGMGetGuestMode[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, +}; +static const REMPARMDESC g_aArgsPGMGstGetPage[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PRTGCPHYS), NULL } +}; +static const REMPARMDESC g_aArgsPGMInvalidatePage[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCPTR), NULL } +}; +static const REMPARMDESC g_aArgsPGMR3PhysTlbGCPhys2Ptr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL } +}; +static const REMPARMDESC g_aArgsPGM3PhysGrowRange[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PCRTGCPHYS), NULL } +}; +static const REMPARMDESC g_aArgsPGMPhysIsGCPhysValid[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL } +}; +static const REMPARMDESC g_aArgsPGMPhysRead[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMACCESSORIGIN), NULL } +}; +static const REMPARMDESC g_aArgsPGMPhysSimpleReadGCPtr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCPTR), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; +static const REMPARMDESC g_aArgsPGMPhysWrite[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMACCESSORIGIN), NULL } +}; +static const REMPARMDESC g_aArgsPGMChangeMode[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t), NULL } +}; +static const REMPARMDESC g_aArgsPGMFlushTLB[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(bool), NULL } +}; +static const REMPARMDESC g_aArgsPGMR3PhysReadUxx[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMACCESSORIGIN), NULL } +}; +static const REMPARMDESC g_aArgsPGMR3PhysWriteU8[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMACCESSORIGIN), NULL } +}; +static const REMPARMDESC g_aArgsPGMR3PhysWriteU16[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint16_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMACCESSORIGIN), NULL } +}; +static const REMPARMDESC g_aArgsPGMR3PhysWriteU32[] = +{ + { REMPARMDESC_FLAGS_INT, 
sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMACCESSORIGIN), NULL } +}; +static const REMPARMDESC g_aArgsPGMR3PhysWriteU64[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_GCPHYS, sizeof(RTGCPHYS), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint64_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PGMACCESSORIGIN), NULL } +}; +static const REMPARMDESC g_aArgsRTMemReallocTag[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void*), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsRTMemEfRealloc[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void*), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned int), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsSSMR3GetGCPtr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PRTGCPTR), NULL } +}; +static const REMPARMDESC g_aArgsSSMR3GetMem[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; +static const REMPARMDESC g_aArgsSSMR3GetU32[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t *), NULL } +}; +static const REMPARMDESC g_aArgsSSMR3GetUInt[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PRTUINT), NULL } +}; +static const REMPARMDESC g_aArgsSSMR3PutGCPtr[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCPTR), NULL } +}; +static const REMPARMDESC g_aArgsSSMR3PutMem[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; +static const REMPARMDESC g_aArgsSSMR3PutU32[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, +}; +static const REMPARMDESC g_aArgsSSMR3PutUInt[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(RTUINT), NULL }, +}; + +static const REMPARMDESC g_aArgsSSMIntLiveExecCallback[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, +}; +static REMFNDESC g_SSMIntLiveExecCallback = +{ + "SSMIntLiveExecCallback", NULL, &g_aArgsSSMIntLiveExecCallback[0], RT_ELEMENTS(g_aArgsSSMIntLiveExecCallback), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL +}; + +static const REMPARMDESC g_aArgsSSMIntLiveVoteCallback[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, +}; +static REMFNDESC g_SSMIntLiveVoteCallback = +{ + "SSMIntLiveVoteCallback", NULL, &g_aArgsSSMIntLiveVoteCallback[0], RT_ELEMENTS(g_aArgsSSMIntLiveVoteCallback), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL +}; + +static const REMPARMDESC g_aArgsSSMIntCallback[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, +}; +static REMFNDESC g_SSMIntCallback = +{ + "SSMIntCallback", NULL, 
&g_aArgsSSMIntCallback[0], RT_ELEMENTS(g_aArgsSSMIntCallback), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL +}; + +static const REMPARMDESC g_aArgsSSMIntLoadExecCallback[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(PSSMHANDLE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, +}; +static REMFNDESC g_SSMIntLoadExecCallback = +{ + "SSMIntLoadExecCallback", NULL, &g_aArgsSSMIntLoadExecCallback[0], RT_ELEMENTS(g_aArgsSSMIntLoadExecCallback), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL +}; +/* Note: don't forget about the handwritten assembly wrapper when changing this! */ +static const REMPARMDESC g_aArgsSSMR3RegisterInternal[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTLIVEPREP), &g_SSMIntCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTLIVEEXEC), &g_SSMIntLiveExecCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTLIVEVOTE), &g_SSMIntLiveVoteCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTSAVEPREP), &g_SSMIntCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTSAVEEXEC), &g_SSMIntCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTSAVEDONE), &g_SSMIntCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTLOADPREP), &g_SSMIntCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTLOADEXEC), &g_SSMIntLoadExecCallback }, + { REMPARMDESC_FLAGS_PFN, sizeof(PFNSSMINTLOADDONE), &g_SSMIntCallback }, +}; + +static const REMPARMDESC g_aArgsSTAMR3Register[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(STAMTYPE), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(STAMVISIBILITY), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(STAMUNIT), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsSTAMR3Deregister[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, +}; +static const REMPARMDESC g_aArgsTRPMAssertTrap[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(TRPMEVENT), NULL } +}; +static const REMPARMDESC g_aArgsTRPMQueryTrap[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(uint8_t *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(TRPMEVENT *), NULL } +}; +static const REMPARMDESC g_aArgsTRPMSetErrorCode[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCUINT), NULL } +}; +static const REMPARMDESC g_aArgsTRPMSetFaultAddress[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMCPU), NULL }, + { REMPARMDESC_FLAGS_GCPTR, sizeof(RTGCUINT), NULL } +}; +static const REMPARMDESC g_aArgsVMR3ReqCallWait[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVM), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(VMCPUID), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_ELLIPSIS, 0, NULL } +}; +static const REMPARMDESC g_aArgsVMR3ReqFree[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PVMREQ), NULL } +}; + +/* IPRT args */ +static const REMPARMDESC 
g_aArgsRTAssertMsg1[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsRTAssertMsg2[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_ELLIPSIS, 0, NULL } +}; +static const REMPARMDESC g_aArgsRTAssertMsg2V[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_VALIST, 0, NULL } +}; +static const REMPARMDESC g_aArgsRTLogGetDefaultInstanceEx[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(uint32_t), NULL } +}; +static const REMPARMDESC g_aArgsRTLogFlags[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PRTLOGGER), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL } +}; +static const REMPARMDESC g_aArgsRTLogFlush[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PRTLOGGER), NULL } +}; +static const REMPARMDESC g_aArgsRTLogLoggerEx[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PRTLOGGER), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_ELLIPSIS, 0, NULL } +}; +static const REMPARMDESC g_aArgsRTLogLoggerExV[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(PRTLOGGER), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_VALIST, 0, NULL } +}; +static const REMPARMDESC g_aArgsRTLogPrintf[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_ELLIPSIS, 0, NULL } +}; +static const REMPARMDESC g_aArgsRTMemProtect[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(unsigned), NULL } +}; +static const REMPARMDESC g_aArgsRTStrPrintf[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_ELLIPSIS, 0, NULL } +}; +static const REMPARMDESC g_aArgsRTStrPrintfV[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(char *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const char *), NULL }, + { REMPARMDESC_FLAGS_VALIST, 0, NULL } +}; +static const REMPARMDESC g_aArgsThread[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(RTTHREAD), NULL } +}; + + +/* CRT args */ +static const REMPARMDESC g_aArgsmemcpy[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(const void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; +static const REMPARMDESC g_aArgsmemset[] = +{ + { REMPARMDESC_FLAGS_INT, sizeof(void *), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(int), NULL }, + { REMPARMDESC_FLAGS_INT, sizeof(size_t), NULL } +}; + +# endif /* !VBOX_USE_BITNESS_SELECTOR */ + +/** @} */ + +/** + * Descriptors for the exported functions. + */ +static const REMFNDESC g_aExports[] = +{ /* pszName, (void *)pv, pParams, cParams, fFlags, cb, pvWrapper. 
*/ + { "REMR3Init", (void *)&pfnREMR3Init, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3InitFinalize", (void *)&pfnREMR3InitFinalize, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3Term", (void *)&pfnREMR3Term, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3Reset", (void *)&pfnREMR3Reset, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3Step", (void *)&pfnREMR3Step, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3BreakpointSet", (void *)&pfnREMR3BreakpointSet, &g_aArgsBreakpoint[0], RT_ELEMENTS(g_aArgsBreakpoint), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3BreakpointClear", (void *)&pfnREMR3BreakpointClear, &g_aArgsBreakpoint[0], RT_ELEMENTS(g_aArgsBreakpoint), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3EmulateInstruction", (void *)&pfnREMR3EmulateInstruction, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3Run", (void *)&pfnREMR3Run, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3State", (void *)&pfnREMR3State, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3StateBack", (void *)&pfnREMR3StateBack, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3StateUpdate", (void *)&pfnREMR3StateUpdate, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3A20Set", (void *)&pfnREMR3A20Set, &g_aArgsA20Set[0], RT_ELEMENTS(g_aArgsA20Set), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3ReplayHandlerNotifications", (void *)&pfnREMR3ReplayHandlerNotifications, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyPhysRamRegister", (void *)&pfnREMR3NotifyPhysRamRegister, &g_aArgsNotifyPhysRamRegister[0], RT_ELEMENTS(g_aArgsNotifyPhysRamRegister), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyPhysRamDeregister", (void *)&pfnREMR3NotifyPhysRamDeregister, &g_aArgsNotifyPhysRamDeregister[0], RT_ELEMENTS(g_aArgsNotifyPhysRamDeregister), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyPhysRomRegister", (void *)&pfnREMR3NotifyPhysRomRegister, &g_aArgsNotifyPhysRomRegister[0], RT_ELEMENTS(g_aArgsNotifyPhysRomRegister), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyHandlerPhysicalModify", (void *)&pfnREMR3NotifyHandlerPhysicalModify, &g_aArgsNotifyHandlerPhysicalModify[0], RT_ELEMENTS(g_aArgsNotifyHandlerPhysicalModify), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyHandlerPhysicalRegister", (void *)&pfnREMR3NotifyHandlerPhysicalRegister, &g_aArgsNotifyHandlerPhysicalRegister[0], RT_ELEMENTS(g_aArgsNotifyHandlerPhysicalRegister), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyHandlerPhysicalDeregister", (void *)&pfnREMR3NotifyHandlerPhysicalDeregister, &g_aArgsNotifyHandlerPhysicalDeregister[0], RT_ELEMENTS(g_aArgsNotifyHandlerPhysicalDeregister), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyInterruptSet", (void *)&pfnREMR3NotifyInterruptSet, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyInterruptClear", (void *)&pfnREMR3NotifyInterruptClear, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), 
REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyTimerPending", (void *)&pfnREMR3NotifyTimerPending, &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyDmaPending", (void *)&pfnREMR3NotifyDmaPending, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyQueuePending", (void *)&pfnREMR3NotifyQueuePending, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyFF", (void *)&pfnREMR3NotifyFF, &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "REMR3NotifyCodePageChanged", (void *)&pfnREMR3NotifyCodePageChanged, &g_aArgsNotifyCodePageChanged[0], RT_ELEMENTS(g_aArgsNotifyCodePageChanged), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3DisasEnableStepping", (void *)&pfnREMR3DisasEnableStepping, &g_aArgsDisasEnableStepping[0], RT_ELEMENTS(g_aArgsDisasEnableStepping), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "REMR3IsPageAccessHandled", (void *)&pfnREMR3IsPageAccessHandled, &g_aArgsIsPageAccessHandled[0], RT_ELEMENTS(g_aArgsIsPageAccessHandled), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL } +}; + +# ifndef VBOX_USE_BITNESS_SELECTOR + +# ifdef VBOX_WITHOUT_REM_LDR_CYCLE +# define VMM_FN(name) NULL +# else +# define VMM_FN(name) (void *)(uintptr_t)& name +# endif + +/** + * Descriptors for the functions imported from VBoxVMM. + */ +static REMFNDESC g_aVMMImports[] = +{ + { "APICUpdatePendingInterrupts", VMM_FN(APICUpdatePendingInterrupts), &g_aArgsAPICUpdatePendingInterrupts[0], RT_ELEMENTS(g_aArgsAPICUpdatePendingInterrupts), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "APICGetTpr", VMM_FN(APICGetTpr), &g_aArgsAPICGetTpr[0], RT_ELEMENTS(g_aArgsAPICGetTpr), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "APICSetTpr", VMM_FN(APICSetTpr), &g_aArgsAPICSetTpr[0], RT_ELEMENTS(g_aArgsAPICSetTpr), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "CPUMR3RemEnter", VMM_FN(CPUMR3RemEnter), &g_aArgsCPUMR3RemEnter[0], RT_ELEMENTS(g_aArgsCPUMR3RemEnter), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMR3RemLeave", VMM_FN(CPUMR3RemLeave), &g_aArgsCPUMR3RemLeave[0], RT_ELEMENTS(g_aArgsCPUMR3RemLeave), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "CPUMSetChangedFlags", VMM_FN(CPUMSetChangedFlags), &g_aArgsCPUMSetChangedFlags[0], RT_ELEMENTS(g_aArgsCPUMSetChangedFlags), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "CPUMGetGuestCPL", VMM_FN(CPUMGetGuestCPL), &g_aArgsCPUMGetGuestCpl[0], RT_ELEMENTS(g_aArgsCPUMGetGuestCpl), REMFNDESC_FLAGS_RET_INT, sizeof(unsigned), NULL }, + { "CPUMQueryGuestMsr", VMM_FN(CPUMQueryGuestMsr), &g_aArgsCPUMQueryGuestMsr[0], RT_ELEMENTS(g_aArgsCPUMQueryGuestMsr), REMFNDESC_FLAGS_RET_INT, sizeof(uint64_t), NULL }, + { "CPUMSetGuestMsr", VMM_FN(CPUMSetGuestMsr), &g_aArgsCPUMSetGuestMsr[0], RT_ELEMENTS(g_aArgsCPUMSetGuestMsr), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "CPUMGetGuestCpuId", VMM_FN(CPUMGetGuestCpuId), &g_aArgsCPUMGetGuestCpuId[0], RT_ELEMENTS(g_aArgsCPUMGetGuestCpuId), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "CPUMGetGuestEAX", VMM_FN(CPUMGetGuestEAX), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestEBP", VMM_FN(CPUMGetGuestEBP), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestEBX", VMM_FN(CPUMGetGuestEBX), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestECX", VMM_FN(CPUMGetGuestECX), &g_aArgsVMCPU[0], 
RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestEDI", VMM_FN(CPUMGetGuestEDI), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestEDX", VMM_FN(CPUMGetGuestEDX), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestEIP", VMM_FN(CPUMGetGuestEIP), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestESI", VMM_FN(CPUMGetGuestESI), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestESP", VMM_FN(CPUMGetGuestESP), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "CPUMGetGuestCS", VMM_FN(CPUMGetGuestCS), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(RTSEL), NULL }, + { "CPUMGetGuestSS", VMM_FN(CPUMGetGuestSS), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(RTSEL), NULL }, + { "CPUMGetGuestCpuVendor", VMM_FN(CPUMGetGuestCpuVendor), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(CPUMCPUVENDOR), NULL }, + { "CPUMQueryGuestCtxPtr", VMM_FN(CPUMQueryGuestCtxPtr), &g_aArgsCPUMQueryGuestCtxPtr[0], RT_ELEMENTS(g_aArgsCPUMQueryGuestCtxPtr), REMFNDESC_FLAGS_RET_INT, sizeof(PCPUMCTX), NULL }, + { "CSAMR3MonitorPage", VMM_FN(CSAMR3MonitorPage), &g_aArgsCSAMR3MonitorPage[0], RT_ELEMENTS(g_aArgsCSAMR3MonitorPage), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "CSAMR3UnmonitorPage", VMM_FN(CSAMR3UnmonitorPage), &g_aArgsCSAMR3UnmonitorPage[0], RT_ELEMENTS(g_aArgsCSAMR3UnmonitorPage), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "CSAMR3RecordCallAddress", VMM_FN(CSAMR3RecordCallAddress), &g_aArgsCSAMR3RecordCallAddress[0], RT_ELEMENTS(g_aArgsCSAMR3RecordCallAddress), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, +# if defined(VBOX_WITH_DEBUGGER) && !(defined(RT_OS_WINDOWS) && defined(RT_ARCH_AMD64)) /* the callbacks are problematic */ + { "DBGCRegisterCommands", VMM_FN(DBGCRegisterCommands), &g_aArgsDBGCRegisterCommands[0], RT_ELEMENTS(g_aArgsDBGCRegisterCommands), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, +# endif + { "DBGFR3DisasInstrEx", VMM_FN(DBGFR3DisasInstrEx), &g_aArgsDBGFR3DisasInstrEx[0], RT_ELEMENTS(g_aArgsDBGFR3DisasInstrEx), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "DBGFR3DisasInstrCurrentLogInternal", VMM_FN(DBGFR3DisasInstrCurrentLogInternal), &g_aArgsDBGFR3DisasInstrCurrentLogInternal[0], RT_ELEMENTS(g_aArgsDBGFR3DisasInstrCurrentLogInternal),REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "DBGFR3Info", VMM_FN(DBGFR3Info), &g_aArgsDBGFR3Info[0], RT_ELEMENTS(g_aArgsDBGFR3Info), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "DBGFR3InfoLogRelHlp", VMM_FN(DBGFR3InfoLogRelHlp), NULL, 0, REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "DBGFR3AsSymbolByAddr", VMM_FN(DBGFR3AsSymbolByAddr), &g_aArgsDBGFR3AsSymbolByAddr[0], RT_ELEMENTS(g_aArgsDBGFR3AsSymbolByAddr), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "DBGFR3AddrFromFlat", VMM_FN(DBGFR3AddrFromFlat), &g_aArgsDBGFR3AddrFromFlat[0], RT_ELEMENTS(g_aArgsDBGFR3AddrFromFlat), REMFNDESC_FLAGS_RET_INT, sizeof(PDBGFADDRESS), NULL }, + { "DISInstrToStr", VMM_FN(DISInstrToStr), &g_aArgsDISInstrToStr[0], RT_ELEMENTS(g_aArgsDISInstrToStr), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "EMR3FatalError", VMM_FN(EMR3FatalError), &g_aArgsEMR3FatalError[0], RT_ELEMENTS(g_aArgsEMR3FatalError), 
REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "EMRemLock", VMM_FN(EMRemLock), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "EMRemUnlock", VMM_FN(EMRemUnlock), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "EMRemIsLockOwner", VMM_FN(EMRemIsLockOwner), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, sizeof(bool), NULL }, + { "EMGetInhibitInterruptsPC", VMM_FN(EMGetInhibitInterruptsPC), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(RTGCPTR), NULL }, + { "EMSetInhibitInterruptsPC", VMM_FN(EMSetInhibitInterruptsPC), &g_aArgsEMSetInhibitInterruptsPC[0], RT_ELEMENTS(g_aArgsEMSetInhibitInterruptsPC), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "HMIsEnabledNotMacro", VMM_FN(HMIsEnabledNotMacro), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "HMCanExecuteGuest", VMM_FN(HMCanExecuteGuest), &g_aArgsHMCanExecuteGuest[0], RT_ELEMENTS(g_aArgsHMCanExecuteGuest), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "IOMIOPortRead", VMM_FN(IOMIOPortRead), &g_aArgsIOMIOPortRead[0], RT_ELEMENTS(g_aArgsIOMIOPortRead), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "IOMIOPortWrite", VMM_FN(IOMIOPortWrite), &g_aArgsIOMIOPortWrite[0], RT_ELEMENTS(g_aArgsIOMIOPortWrite), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "IOMMMIORead", VMM_FN(IOMMMIORead), &g_aArgsIOMMMIORead[0], RT_ELEMENTS(g_aArgsIOMMMIORead), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "IOMMMIOWrite", VMM_FN(IOMMMIOWrite), &g_aArgsIOMMMIOWrite[0], RT_ELEMENTS(g_aArgsIOMMMIOWrite), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "MMR3HeapAlloc", VMM_FN(MMR3HeapAlloc), &g_aArgsMMR3HeapAlloc[0], RT_ELEMENTS(g_aArgsMMR3HeapAlloc), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "MMR3HeapAllocZ", VMM_FN(MMR3HeapAllocZ), &g_aArgsMMR3HeapAllocZ[0], RT_ELEMENTS(g_aArgsMMR3HeapAllocZ), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "MMR3PhysGetRamSize", VMM_FN(MMR3PhysGetRamSize), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(uint64_t), NULL }, + { "PATMIsPatchGCAddr", VMM_FN(PATMIsPatchGCAddr), &g_aArgsPATMIsPatchGCAddr[0], RT_ELEMENTS(g_aArgsPATMIsPatchGCAddr), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "PATMR3QueryOpcode", VMM_FN(PATMR3QueryOpcode), &g_aArgsPATMR3QueryOpcode[0], RT_ELEMENTS(g_aArgsPATMR3QueryOpcode), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PDMR3DmaRun", VMM_FN(PDMR3DmaRun), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "PDMR3CritSectInit", VMM_FN(PDMR3CritSectInit), &g_aArgsPDMR3CritSectInit[0], RT_ELEMENTS(g_aArgsPDMR3CritSectInit), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PDMCritSectEnter", VMM_FN(PDMCritSectEnter), &g_aArgsPDMCritSectEnter[0], RT_ELEMENTS(g_aArgsPDMCritSectEnter), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PDMCritSectLeave", VMM_FN(PDMCritSectLeave), &g_aArgsPTR[0], RT_ELEMENTS(g_aArgsPTR), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, +# ifdef VBOX_STRICT + { "PDMCritSectEnterDebug", VMM_FN(PDMCritSectEnterDebug), &g_aArgsPDMCritSectEnterDebug[0], RT_ELEMENTS(g_aArgsPDMCritSectEnterDebug), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, +# endif + { "PDMGetInterrupt", VMM_FN(PDMGetInterrupt), &g_aArgsPDMGetInterrupt[0], RT_ELEMENTS(g_aArgsPDMGetInterrupt), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PDMIsaSetIrq", VMM_FN(PDMIsaSetIrq), &g_aArgsPDMIsaSetIrq[0], RT_ELEMENTS(g_aArgsPDMIsaSetIrq), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { 
"PGMGetGuestMode", VMM_FN(PGMGetGuestMode), &g_aArgsPGMGetGuestMode[0], RT_ELEMENTS(g_aArgsPGMGetGuestMode), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMGstGetPage", VMM_FN(PGMGstGetPage), &g_aArgsPGMGstGetPage[0], RT_ELEMENTS(g_aArgsPGMGstGetPage), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMInvalidatePage", VMM_FN(PGMInvalidatePage), &g_aArgsPGMInvalidatePage[0], RT_ELEMENTS(g_aArgsPGMInvalidatePage), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMPhysIsGCPhysValid", VMM_FN(PGMPhysIsGCPhysValid), &g_aArgsPGMPhysIsGCPhysValid[0], RT_ELEMENTS(g_aArgsPGMPhysIsGCPhysValid), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "PGMPhysIsA20Enabled", VMM_FN(PGMPhysIsA20Enabled), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "PGMPhysRead", VMM_FN(PGMPhysRead), &g_aArgsPGMPhysRead[0], RT_ELEMENTS(g_aArgsPGMPhysRead), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMPhysSimpleReadGCPtr", VMM_FN(PGMPhysSimpleReadGCPtr), &g_aArgsPGMPhysSimpleReadGCPtr[0], RT_ELEMENTS(g_aArgsPGMPhysSimpleReadGCPtr), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMPhysWrite", VMM_FN(PGMPhysWrite), &g_aArgsPGMPhysWrite[0], RT_ELEMENTS(g_aArgsPGMPhysWrite), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "PGMChangeMode", VMM_FN(PGMChangeMode), &g_aArgsPGMChangeMode[0], RT_ELEMENTS(g_aArgsPGMChangeMode), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMFlushTLB", VMM_FN(PGMFlushTLB), &g_aArgsPGMFlushTLB[0], RT_ELEMENTS(g_aArgsPGMFlushTLB), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMCr0WpEnabled", VMM_FN(PGMCr0WpEnabled), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "PGMR3PhysReadU8", VMM_FN(PGMR3PhysReadU8), &g_aArgsPGMR3PhysReadUxx[0], RT_ELEMENTS(g_aArgsPGMR3PhysReadUxx), REMFNDESC_FLAGS_RET_INT, sizeof(uint8_t), NULL }, + { "PGMR3PhysReadU16", VMM_FN(PGMR3PhysReadU16), &g_aArgsPGMR3PhysReadUxx[0], RT_ELEMENTS(g_aArgsPGMR3PhysReadUxx), REMFNDESC_FLAGS_RET_INT, sizeof(uint16_t), NULL }, + { "PGMR3PhysReadU32", VMM_FN(PGMR3PhysReadU32), &g_aArgsPGMR3PhysReadUxx[0], RT_ELEMENTS(g_aArgsPGMR3PhysReadUxx), REMFNDESC_FLAGS_RET_INT, sizeof(uint32_t), NULL }, + { "PGMR3PhysReadU64", VMM_FN(PGMR3PhysReadU64), &g_aArgsPGMR3PhysReadUxx[0], RT_ELEMENTS(g_aArgsPGMR3PhysReadUxx), REMFNDESC_FLAGS_RET_INT, sizeof(uint64_t), NULL }, + { "PGMR3PhysWriteU8", VMM_FN(PGMR3PhysWriteU8), &g_aArgsPGMR3PhysWriteU8[0], RT_ELEMENTS(g_aArgsPGMR3PhysWriteU8), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "PGMR3PhysWriteU16", VMM_FN(PGMR3PhysWriteU16), &g_aArgsPGMR3PhysWriteU16[0], RT_ELEMENTS(g_aArgsPGMR3PhysWriteU16), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "PGMR3PhysWriteU32", VMM_FN(PGMR3PhysWriteU32), &g_aArgsPGMR3PhysWriteU32[0], RT_ELEMENTS(g_aArgsPGMR3PhysWriteU32), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "PGMR3PhysWriteU64", VMM_FN(PGMR3PhysWriteU64), &g_aArgsPGMR3PhysWriteU64[0], RT_ELEMENTS(g_aArgsPGMR3PhysWriteU32), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "PGMR3PhysTlbGCPhys2Ptr", VMM_FN(PGMR3PhysTlbGCPhys2Ptr), &g_aArgsPGMR3PhysTlbGCPhys2Ptr[0], RT_ELEMENTS(g_aArgsPGMR3PhysTlbGCPhys2Ptr), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "PGMIsLockOwner", VMM_FN(PGMIsLockOwner), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "SSMR3GetGCPtr", VMM_FN(SSMR3GetGCPtr), &g_aArgsSSMR3GetGCPtr[0], RT_ELEMENTS(g_aArgsSSMR3GetGCPtr), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3GetMem", VMM_FN(SSMR3GetMem), &g_aArgsSSMR3GetMem[0], 
RT_ELEMENTS(g_aArgsSSMR3GetMem), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3GetU32", VMM_FN(SSMR3GetU32), &g_aArgsSSMR3GetU32[0], RT_ELEMENTS(g_aArgsSSMR3GetU32), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3GetUInt", VMM_FN(SSMR3GetUInt), &g_aArgsSSMR3GetUInt[0], RT_ELEMENTS(g_aArgsSSMR3GetUInt), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3PutGCPtr", VMM_FN(SSMR3PutGCPtr), &g_aArgsSSMR3PutGCPtr[0], RT_ELEMENTS(g_aArgsSSMR3PutGCPtr), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3PutMem", VMM_FN(SSMR3PutMem), &g_aArgsSSMR3PutMem[0], RT_ELEMENTS(g_aArgsSSMR3PutMem), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3PutU32", VMM_FN(SSMR3PutU32), &g_aArgsSSMR3PutU32[0], RT_ELEMENTS(g_aArgsSSMR3PutU32), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3PutUInt", VMM_FN(SSMR3PutUInt), &g_aArgsSSMR3PutUInt[0], RT_ELEMENTS(g_aArgsSSMR3PutUInt), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "SSMR3RegisterInternal", VMM_FN(SSMR3RegisterInternal), &g_aArgsSSMR3RegisterInternal[0], RT_ELEMENTS(g_aArgsSSMR3RegisterInternal), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "STAMR3Register", VMM_FN(STAMR3Register), &g_aArgsSTAMR3Register[0], RT_ELEMENTS(g_aArgsSTAMR3Register), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "STAMR3Deregister", VMM_FN(STAMR3Deregister), &g_aArgsSTAMR3Deregister[0], RT_ELEMENTS(g_aArgsSTAMR3Deregister), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "TMCpuTickGet", VMM_FN(TMCpuTickGet), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(uint64_t), NULL }, + { "TMR3NotifySuspend", VMM_FN(TMR3NotifySuspend), &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "TMR3NotifyResume", VMM_FN(TMR3NotifyResume), &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "TMNotifyEndOfExecution", VMM_FN(TMNotifyEndOfExecution), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "TMNotifyStartOfExecution", VMM_FN(TMNotifyStartOfExecution), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "TMTimerPollBool", VMM_FN(TMTimerPollBool), &g_aArgsVMandVMCPU[0], RT_ELEMENTS(g_aArgsVMandVMCPU), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "TMR3TimerQueuesDo", VMM_FN(TMR3TimerQueuesDo), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "TRPMAssertTrap", VMM_FN(TRPMAssertTrap), &g_aArgsTRPMAssertTrap[0], RT_ELEMENTS(g_aArgsTRPMAssertTrap), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "TRPMGetErrorCode", VMM_FN(TRPMGetErrorCode), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(RTGCUINT), NULL }, + { "TRPMGetFaultAddress", VMM_FN(TRPMGetFaultAddress), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(RTGCUINTPTR),NULL }, + { "TRPMQueryTrap", VMM_FN(TRPMQueryTrap), &g_aArgsTRPMQueryTrap[0], RT_ELEMENTS(g_aArgsTRPMQueryTrap), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "TRPMResetTrap", VMM_FN(TRPMResetTrap), &g_aArgsVMCPU[0], RT_ELEMENTS(g_aArgsVMCPU), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "TRPMSetErrorCode", VMM_FN(TRPMSetErrorCode), &g_aArgsTRPMSetErrorCode[0], RT_ELEMENTS(g_aArgsTRPMSetErrorCode), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "TRPMSetFaultAddress", VMM_FN(TRPMSetFaultAddress), &g_aArgsTRPMSetFaultAddress[0], RT_ELEMENTS(g_aArgsTRPMSetFaultAddress), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "VMMGetCpu", VMM_FN(VMMGetCpu), 
&g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(PVMCPU), NULL }, + { "VMR3ReqPriorityCallWait", VMM_FN(VMR3ReqPriorityCallWait), &g_aArgsVMR3ReqCallWait[0], RT_ELEMENTS(g_aArgsVMR3ReqCallWait), REMFNDESC_FLAGS_RET_INT | REMFNDESC_FLAGS_ELLIPSIS, sizeof(int), NULL }, + { "VMR3ReqFree", VMM_FN(VMR3ReqFree), &g_aArgsVMR3ReqFree[0], RT_ELEMENTS(g_aArgsVMR3ReqFree), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, +// { "", VMM_FN(), &g_aArgsVM[0], RT_ELEMENTS(g_aArgsVM), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, +}; + + +/** + * Descriptors for the functions imported from VBoxRT. + */ +static REMFNDESC g_aRTImports[] = +{ + { "RTAssertMsg1", (void *)(uintptr_t)&RTAssertMsg1, &g_aArgsRTAssertMsg1[0], RT_ELEMENTS(g_aArgsRTAssertMsg1), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTAssertMsg1Weak", (void *)(uintptr_t)&RTAssertMsg1Weak, &g_aArgsRTAssertMsg1[0], RT_ELEMENTS(g_aArgsRTAssertMsg1), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTAssertMsg2", (void *)(uintptr_t)&RTAssertMsg2, &g_aArgsRTAssertMsg2[0], RT_ELEMENTS(g_aArgsRTAssertMsg2), REMFNDESC_FLAGS_RET_VOID | REMFNDESC_FLAGS_ELLIPSIS, 0, NULL }, + { "RTAssertMsg2V", (void *)(uintptr_t)&RTAssertMsg2V, &g_aArgsRTAssertMsg2V[0], RT_ELEMENTS(g_aArgsRTAssertMsg2V), REMFNDESC_FLAGS_RET_VOID | REMFNDESC_FLAGS_VALIST, 0, NULL }, + { "RTAssertMsg2Weak", (void *)(uintptr_t)&RTAssertMsg2Weak, &g_aArgsRTAssertMsg2[0], RT_ELEMENTS(g_aArgsRTAssertMsg2), REMFNDESC_FLAGS_RET_VOID | REMFNDESC_FLAGS_ELLIPSIS, 0, NULL }, + { "RTAssertShouldPanic", (void *)(uintptr_t)&RTAssertShouldPanic, NULL, 0, REMFNDESC_FLAGS_RET_INT, sizeof(bool), NULL }, + { "RTLogDefaultInstance", (void *)(uintptr_t)&RTLogDefaultInstance, NULL, 0, REMFNDESC_FLAGS_RET_INT, sizeof(PRTLOGGER), NULL }, + { "RTLogRelGetDefaultInstance", (void *)(uintptr_t)&RTLogRelGetDefaultInstance, NULL, 0, REMFNDESC_FLAGS_RET_INT, sizeof(PRTLOGGER), NULL }, + { "RTLogDefaultInstanceEx", (void *)(uintptr_t)&RTLogDefaultInstance, &g_aArgsRTLogGetDefaultInstanceEx[0], RT_ELEMENTS(g_aArgsRTLogGetDefaultInstanceEx), REMFNDESC_FLAGS_RET_INT, sizeof(PRTLOGGER), NULL }, + { "RTLogRelGetDefaultInstanceEx", (void *)(uintptr_t)&RTLogRelGetDefaultInstance, &g_aArgsRTLogGetDefaultInstanceEx[0], RT_ELEMENTS(g_aArgsRTLogGetDefaultInstanceEx), REMFNDESC_FLAGS_RET_INT, sizeof(PRTLOGGER), NULL }, + { "RTLogFlags", (void *)(uintptr_t)&RTLogFlags, &g_aArgsRTLogFlags[0], RT_ELEMENTS(g_aArgsRTLogFlags), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "RTLogFlush", (void *)(uintptr_t)&RTLogFlush, &g_aArgsRTLogFlush[0], RT_ELEMENTS(g_aArgsRTLogFlush), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "RTLogLoggerEx", (void *)(uintptr_t)&RTLogLoggerEx, &g_aArgsRTLogLoggerEx[0], RT_ELEMENTS(g_aArgsRTLogLoggerEx), REMFNDESC_FLAGS_RET_VOID | REMFNDESC_FLAGS_ELLIPSIS, 0, NULL }, + { "RTLogLoggerExV", (void *)(uintptr_t)&RTLogLoggerExV, &g_aArgsRTLogLoggerExV[0], RT_ELEMENTS(g_aArgsRTLogLoggerExV), REMFNDESC_FLAGS_RET_VOID | REMFNDESC_FLAGS_VALIST, 0, NULL }, + { "RTLogPrintf", (void *)(uintptr_t)&RTLogPrintf, &g_aArgsRTLogPrintf[0], RT_ELEMENTS(g_aArgsRTLogPrintf), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTLogRelPrintf", (void *)(uintptr_t)&RTLogRelPrintf, &g_aArgsRTLogPrintf[0], RT_ELEMENTS(g_aArgsRTLogPrintf), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTMemAllocTag", (void *)(uintptr_t)&RTMemAllocTag, &g_aArgsSIZE_TTag[0], RT_ELEMENTS(g_aArgsSIZE_TTag), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemAllocZTag", (void *)(uintptr_t)&RTMemAllocZTag, &g_aArgsSIZE_TTag[0], 
RT_ELEMENTS(g_aArgsSIZE_TTag), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemReallocTag", (void *)(uintptr_t)&RTMemReallocTag, &g_aArgsRTMemReallocTag[0], RT_ELEMENTS(g_aArgsRTMemReallocTag), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemExecAllocTag", (void *)(uintptr_t)&RTMemExecAllocTag, &g_aArgsSIZE_TTag[0], RT_ELEMENTS(g_aArgsSIZE_TTag), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemExecFree", (void *)(uintptr_t)&RTMemExecFree, &g_aArgsPTR_SIZE_T[0], RT_ELEMENTS(g_aArgsPTR_SIZE_T), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTMemFree", (void *)(uintptr_t)&RTMemFree, &g_aArgsPTR[0], RT_ELEMENTS(g_aArgsPTR), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTMemPageAllocTag", (void *)(uintptr_t)&RTMemPageAllocTag, &g_aArgsSIZE_TTag[0], RT_ELEMENTS(g_aArgsSIZE_TTag), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemPageFree", (void *)(uintptr_t)&RTMemPageFree, &g_aArgsPTR_SIZE_T[0], RT_ELEMENTS(g_aArgsPTR_SIZE_T), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTMemProtect", (void *)(uintptr_t)&RTMemProtect, &g_aArgsRTMemProtect[0], RT_ELEMENTS(g_aArgsRTMemProtect), REMFNDESC_FLAGS_RET_INT, sizeof(int), NULL }, + { "RTMemEfAlloc", (void *)(uintptr_t)&RTMemEfAlloc, &g_aArgsSIZE_TTagLoc[0], RT_ELEMENTS(g_aArgsSIZE_TTagLoc), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemEfAllocZ", (void *)(uintptr_t)&RTMemEfAllocZ, &g_aArgsSIZE_TTagLoc[0], RT_ELEMENTS(g_aArgsSIZE_TTagLoc), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemEfRealloc", (void *)(uintptr_t)&RTMemEfRealloc, &g_aArgsRTMemEfRealloc[0], RT_ELEMENTS(g_aArgsRTMemEfRealloc), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "RTMemEfFree", (void *)(uintptr_t)&RTMemEfFree, &g_aArgsPTRLoc[0], RT_ELEMENTS(g_aArgsPTRLoc), REMFNDESC_FLAGS_RET_VOID, 0, NULL }, + { "RTStrPrintf", (void *)(uintptr_t)&RTStrPrintf, &g_aArgsRTStrPrintf[0], RT_ELEMENTS(g_aArgsRTStrPrintf), REMFNDESC_FLAGS_RET_INT | REMFNDESC_FLAGS_ELLIPSIS, sizeof(size_t), NULL }, + { "RTStrPrintfV", (void *)(uintptr_t)&RTStrPrintfV, &g_aArgsRTStrPrintfV[0], RT_ELEMENTS(g_aArgsRTStrPrintfV), REMFNDESC_FLAGS_RET_INT | REMFNDESC_FLAGS_VALIST, sizeof(size_t), NULL }, + { "RTThreadSelf", (void *)(uintptr_t)&RTThreadSelf, NULL, 0, REMFNDESC_FLAGS_RET_INT, sizeof(RTTHREAD), NULL }, + { "RTThreadNativeSelf", (void *)(uintptr_t)&RTThreadNativeSelf, NULL, 0, REMFNDESC_FLAGS_RET_INT, sizeof(RTNATIVETHREAD), NULL }, + { "RTLockValidatorWriteLockGetCount", (void *)(uintptr_t)&RTLockValidatorWriteLockGetCount, &g_aArgsThread[0], 0, REMFNDESC_FLAGS_RET_INT, sizeof(int32_t), NULL }, +}; + + +/** + * Descriptors for the functions imported from VBoxRT. + */ +static REMFNDESC g_aCRTImports[] = +{ + { "memcpy", (void *)(uintptr_t)&memcpy, &g_aArgsmemcpy[0], RT_ELEMENTS(g_aArgsmemcpy), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL }, + { "memset", (void *)(uintptr_t)&memset, &g_aArgsmemset[0], RT_ELEMENTS(g_aArgsmemset), REMFNDESC_FLAGS_RET_INT, sizeof(void *), NULL } +/* +floor floor +memcpy memcpy +sqrt sqrt +sqrtf sqrtf +*/ +}; + + +# if defined(USE_REM_CALLING_CONVENTION_GLUE) || defined(USE_REM_IMPORT_JUMP_GLUE) +/** LIFO of read-write-executable memory chunks used for wrappers. 
*/ +static PREMEXECMEM g_pExecMemHead; +# endif +# endif /* !VBOX_USE_BITNESS_SELECTOR */ + + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +# ifndef VBOX_USE_BITNESS_SELECTOR +static int remGenerateExportGlue(PRTUINTPTR pValue, PCREMFNDESC pDesc); + +# ifdef USE_REM_CALLING_CONVENTION_GLUE +DECLASM(int) WrapGCC2MSC0Int(void); DECLASM(int) WrapGCC2MSC0Int_EndProc(void); +DECLASM(int) WrapGCC2MSC1Int(void); DECLASM(int) WrapGCC2MSC1Int_EndProc(void); +DECLASM(int) WrapGCC2MSC2Int(void); DECLASM(int) WrapGCC2MSC2Int_EndProc(void); +DECLASM(int) WrapGCC2MSC3Int(void); DECLASM(int) WrapGCC2MSC3Int_EndProc(void); +DECLASM(int) WrapGCC2MSC4Int(void); DECLASM(int) WrapGCC2MSC4Int_EndProc(void); +DECLASM(int) WrapGCC2MSC5Int(void); DECLASM(int) WrapGCC2MSC5Int_EndProc(void); +DECLASM(int) WrapGCC2MSC6Int(void); DECLASM(int) WrapGCC2MSC6Int_EndProc(void); +DECLASM(int) WrapGCC2MSC7Int(void); DECLASM(int) WrapGCC2MSC7Int_EndProc(void); +DECLASM(int) WrapGCC2MSC8Int(void); DECLASM(int) WrapGCC2MSC8Int_EndProc(void); +DECLASM(int) WrapGCC2MSC9Int(void); DECLASM(int) WrapGCC2MSC9Int_EndProc(void); +DECLASM(int) WrapGCC2MSC10Int(void); DECLASM(int) WrapGCC2MSC10Int_EndProc(void); +DECLASM(int) WrapGCC2MSC11Int(void); DECLASM(int) WrapGCC2MSC11Int_EndProc(void); +DECLASM(int) WrapGCC2MSC12Int(void); DECLASM(int) WrapGCC2MSC12Int_EndProc(void); +DECLASM(int) WrapGCC2MSCVariadictInt(void); DECLASM(int) WrapGCC2MSCVariadictInt_EndProc(void); +DECLASM(int) WrapGCC2MSC_SSMR3RegisterInternal(void); DECLASM(int) WrapGCC2MSC_SSMR3RegisterInternal_EndProc(void); + +DECLASM(int) WrapMSC2GCC0Int(void); DECLASM(int) WrapMSC2GCC0Int_EndProc(void); +DECLASM(int) WrapMSC2GCC1Int(void); DECLASM(int) WrapMSC2GCC1Int_EndProc(void); +DECLASM(int) WrapMSC2GCC2Int(void); DECLASM(int) WrapMSC2GCC2Int_EndProc(void); +DECLASM(int) WrapMSC2GCC3Int(void); DECLASM(int) WrapMSC2GCC3Int_EndProc(void); +DECLASM(int) WrapMSC2GCC4Int(void); DECLASM(int) WrapMSC2GCC4Int_EndProc(void); +DECLASM(int) WrapMSC2GCC5Int(void); DECLASM(int) WrapMSC2GCC5Int_EndProc(void); +DECLASM(int) WrapMSC2GCC6Int(void); DECLASM(int) WrapMSC2GCC6Int_EndProc(void); +DECLASM(int) WrapMSC2GCC7Int(void); DECLASM(int) WrapMSC2GCC7Int_EndProc(void); +DECLASM(int) WrapMSC2GCC8Int(void); DECLASM(int) WrapMSC2GCC8Int_EndProc(void); +DECLASM(int) WrapMSC2GCC9Int(void); DECLASM(int) WrapMSC2GCC9Int_EndProc(void); +# endif + + +# if defined(USE_REM_CALLING_CONVENTION_GLUE) || defined(USE_REM_IMPORT_JUMP_GLUE) +/** + * Allocates a block of memory for glue code. + * + * The returned memory is padded with INT3s. + * + * @returns Pointer to the allocated memory. + * @param The amount of memory to allocate. 
+ */ +static void *remAllocGlue(size_t cb) +{ + PREMEXECMEM pCur = g_pExecMemHead; + uint32_t cbAligned = (uint32_t)RT_ALIGN_32(cb, 32); + while (pCur) + { + if (pCur->cb - pCur->off >= cbAligned) + { + void *pv = (uint8_t *)pCur + pCur->off; + pCur->off += cbAligned; + return memset(pv, 0xcc, cbAligned); + } + pCur = pCur->pNext; + } + + /* add a new chunk */ + AssertReturn(_64K - RT_ALIGN_Z(sizeof(*pCur), 32) > cbAligned, NULL); + pCur = (PREMEXECMEM)RTMemExecAlloc(_64K); + AssertReturn(pCur, NULL); + pCur->cb = _64K; + pCur->off = RT_ALIGN_32(sizeof(*pCur), 32) + cbAligned; + pCur->pNext = g_pExecMemHead; + g_pExecMemHead = pCur; + return memset((uint8_t *)pCur + RT_ALIGN_Z(sizeof(*pCur), 32), 0xcc, cbAligned); +} +# endif /* USE_REM_CALLING_CONVENTION_GLUE || USE_REM_IMPORT_JUMP_GLUE */ + + +# ifdef USE_REM_CALLING_CONVENTION_GLUE +/** + * Checks if a function is all straight forward integers. + * + * @returns True if it's simple, false if it's bothersome. + * @param pDesc The function descriptor. + */ +static bool remIsFunctionAllInts(PCREMFNDESC pDesc) +{ + if ( ( (pDesc->fFlags & REMFNDESC_FLAGS_RET_TYPE_MASK) != REMFNDESC_FLAGS_RET_INT + || pDesc->cbReturn > sizeof(uint64_t)) + && (pDesc->fFlags & REMFNDESC_FLAGS_RET_TYPE_MASK) != REMFNDESC_FLAGS_RET_VOID) + return false; + unsigned i = pDesc->cParams; + while (i-- > 0) + switch (pDesc->paParams[i].fFlags & REMPARMDESC_FLAGS_TYPE_MASK) + { + case REMPARMDESC_FLAGS_INT: + case REMPARMDESC_FLAGS_GCPTR: + case REMPARMDESC_FLAGS_GCPHYS: + case REMPARMDESC_FLAGS_HCPHYS: + break; + + default: + AssertReleaseMsgFailed(("Invalid param flags %#x for #%d of %s!\n", pDesc->paParams[i].fFlags, i, pDesc->pszName)); + case REMPARMDESC_FLAGS_VALIST: + case REMPARMDESC_FLAGS_ELLIPSIS: + case REMPARMDESC_FLAGS_FLOAT: + case REMPARMDESC_FLAGS_STRUCT: + case REMPARMDESC_FLAGS_PFN: + return false; + } + return true; +} + + +/** + * Checks if the function has an ellipsis (...) argument. + * + * @returns true if it has an ellipsis, otherwise false. + * @param pDesc The function descriptor. + */ +static bool remHasFunctionEllipsis(PCREMFNDESC pDesc) +{ + unsigned i = pDesc->cParams; + while (i-- > 0) + if ((pDesc->paParams[i].fFlags & REMPARMDESC_FLAGS_TYPE_MASK) == REMPARMDESC_FLAGS_ELLIPSIS) + return true; + return false; +} + + +/** + * Checks if the function uses floating point (FP) arguments or return value. + * + * @returns true if it uses floating point, otherwise false. + * @param pDesc The function descriptor. + */ +static bool remIsFunctionUsingFP(PCREMFNDESC pDesc) +{ + if ((pDesc->fFlags & REMFNDESC_FLAGS_RET_TYPE_MASK) == REMFNDESC_FLAGS_RET_FLOAT) + return true; + unsigned i = pDesc->cParams; + while (i-- > 0) + if ((pDesc->paParams[i].fFlags & REMPARMDESC_FLAGS_TYPE_MASK) == REMPARMDESC_FLAGS_FLOAT) + return true; + return false; +} + + +/** @name The export and import fixups. + * @{ */ +# define REM_FIXUP_32_REAL_STUFF UINT32_C(0xdeadbeef) +# define REM_FIXUP_64_REAL_STUFF UINT64_C(0xdeadf00df00ddead) +# define REM_FIXUP_64_DESC UINT64_C(0xdead00010001dead) +# define REM_FIXUP_64_LOG_ENTRY UINT64_C(0xdead00020002dead) +# define REM_FIXUP_64_LOG_EXIT UINT64_C(0xdead00030003dead) +# define REM_FIXUP_64_WRAP_GCC_CB UINT64_C(0xdead00040004dead) +/** @} */ + + +/** + * Entry logger function. + * + * @param pDesc The description. + */ +DECLASM(void) remLogEntry(PCREMFNDESC pDesc) +{ + RTPrintf("calling %s\n", pDesc->pszName); +} + + +/** + * Exit logger function. + * + * @param pDesc The description. + * @param pvRet The return code. 
+ */ +DECLASM(void) remLogExit(PCREMFNDESC pDesc, void *pvRet) +{ + RTPrintf("returning %p from %s\n", pvRet, pDesc->pszName); +} + + +/** + * Creates a wrapper for the specified callback function at run time. + * + * @param pDesc The function descriptor. + * @param pValue Upon entry *pValue contains the address of the function to be wrapped. + * Upon return *pValue contains the address of the wrapper glue function. + * @param iParam The parameter index in the function descriptor (0 based). + * If UINT32_MAX pDesc is the descriptor for *pValue. + */ +DECLASM(void) remWrapGCCCallback(PCREMFNDESC pDesc, PRTUINTPTR pValue, uint32_t iParam) +{ + AssertPtr(pDesc); + AssertPtr(pValue); + + /* + * Simple? + */ + if (!*pValue) + return; + + /* + * Locate the right function descriptor. + */ + if (iParam != UINT32_MAX) + { + AssertRelease(iParam < pDesc->cParams); + pDesc = (PCREMFNDESC)pDesc->paParams[iParam].pvExtra; + AssertPtr(pDesc); + } + + /* + * When we get serious, here is where to insert the hash table lookup. + */ + + /* + * Create a new glue patch. + */ +# ifdef RT_OS_WINDOWS + int rc = remGenerateExportGlue(pValue, pDesc); +# else +# error "port me" +# endif + AssertReleaseRC(rc); + + /* + * Add it to the hash (later) + */ +} + + +/** + * Fixes export glue. + * + * @param pvGlue The glue code. + * @param cb The size of the glue code. + * @param pvExport The address of the export we're wrapping. + * @param pDesc The export descriptor. + */ +static void remGenerateExportGlueFixup(void *pvGlue, size_t cb, uintptr_t uExport, PCREMFNDESC pDesc) +{ + union + { + uint8_t *pu8; + int32_t *pi32; + uint32_t *pu32; + uint64_t *pu64; + void *pv; + } u; + u.pv = pvGlue; + + while (cb >= 4) + { + /** @todo add defines for the fixup constants... */ + if (*u.pu32 == REM_FIXUP_32_REAL_STUFF) + { + /* 32-bit rel jmp/call to real export. */ + *u.pi32 = uExport - (uintptr_t)(u.pi32 + 1); + Assert((uintptr_t)(u.pi32 + 1) + *u.pi32 == uExport); + u.pi32++; + cb -= 4; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_REAL_STUFF) + { + /* 64-bit address to the real export. */ + *u.pu64++ = uExport; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_DESC) + { + /* 64-bit address to the descriptor. */ + *u.pu64++ = (uintptr_t)pDesc; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_WRAP_GCC_CB) + { + /* 64-bit address to the entry logger function. */ + *u.pu64++ = (uintptr_t)remWrapGCCCallback; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_LOG_ENTRY) + { + /* 64-bit address to the entry logger function. */ + *u.pu64++ = (uintptr_t)remLogEntry; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_LOG_EXIT) + { + /* 64-bit address to the entry logger function. */ + *u.pu64++ = (uintptr_t)remLogExit; + cb -= 8; + continue; + } + + /* move on. */ + u.pu8++; + cb--; + } +} + + +/** + * Fixes import glue. + * + * @param pvGlue The glue code. + * @param cb The size of the glue code. + * @param pDesc The import descriptor. + */ +static void remGenerateImportGlueFixup(void *pvGlue, size_t cb, PCREMFNDESC pDesc) +{ + union + { + uint8_t *pu8; + int32_t *pi32; + uint32_t *pu32; + uint64_t *pu64; + void *pv; + } u; + u.pv = pvGlue; + + while (cb >= 4) + { + if (*u.pu32 == REM_FIXUP_32_REAL_STUFF) + { + /* 32-bit rel jmp/call to real function. 
*/ + *u.pi32 = (uintptr_t)pDesc->pv - (uintptr_t)(u.pi32 + 1); + Assert((uintptr_t)(u.pi32 + 1) + *u.pi32 == (uintptr_t)pDesc->pv); + u.pi32++; + cb -= 4; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_REAL_STUFF) + { + /* 64-bit address to the real function. */ + *u.pu64++ = (uintptr_t)pDesc->pv; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_DESC) + { + /* 64-bit address to the descriptor. */ + *u.pu64++ = (uintptr_t)pDesc; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_WRAP_GCC_CB) + { + /* 64-bit address to the entry logger function. */ + *u.pu64++ = (uintptr_t)remWrapGCCCallback; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_LOG_ENTRY) + { + /* 64-bit address to the entry logger function. */ + *u.pu64++ = (uintptr_t)remLogEntry; + cb -= 8; + continue; + } + if (cb >= 8 && *u.pu64 == REM_FIXUP_64_LOG_EXIT) + { + /* 64-bit address to the entry logger function. */ + *u.pu64++ = (uintptr_t)remLogExit; + cb -= 8; + continue; + } + + /* move on. */ + u.pu8++; + cb--; + } +} + +# endif /* USE_REM_CALLING_CONVENTION_GLUE */ + + +/** + * Generate wrapper glue code for an export. + * + * This is only used on win64 when loading a 64-bit linux module. So, on other + * platforms it will not do anything. + * + * @returns VBox status code. + * @param pValue IN: Where to get the address of the function to wrap. + * OUT: Where to store the glue address. + * @param pDesc The export descriptor. + */ +static int remGenerateExportGlue(PRTUINTPTR pValue, PCREMFNDESC pDesc) +{ +# ifdef USE_REM_CALLING_CONVENTION_GLUE + uintptr_t *ppfn = (uintptr_t *)pDesc->pv; + + uintptr_t pfn = 0; /* a little hack for the callback glue */ + if (!ppfn) + ppfn = &pfn; + + if (!*ppfn) + { + if (remIsFunctionAllInts(pDesc)) + { + static const struct { void *pvStart, *pvEnd; } s_aTemplates[] = + { + { (void *)&WrapMSC2GCC0Int, (void *)&WrapMSC2GCC0Int_EndProc }, + { (void *)&WrapMSC2GCC1Int, (void *)&WrapMSC2GCC1Int_EndProc }, + { (void *)&WrapMSC2GCC2Int, (void *)&WrapMSC2GCC2Int_EndProc }, + { (void *)&WrapMSC2GCC3Int, (void *)&WrapMSC2GCC3Int_EndProc }, + { (void *)&WrapMSC2GCC4Int, (void *)&WrapMSC2GCC4Int_EndProc }, + { (void *)&WrapMSC2GCC5Int, (void *)&WrapMSC2GCC5Int_EndProc }, + { (void *)&WrapMSC2GCC6Int, (void *)&WrapMSC2GCC6Int_EndProc }, + { (void *)&WrapMSC2GCC7Int, (void *)&WrapMSC2GCC7Int_EndProc }, + { (void *)&WrapMSC2GCC8Int, (void *)&WrapMSC2GCC8Int_EndProc }, + { (void *)&WrapMSC2GCC9Int, (void *)&WrapMSC2GCC9Int_EndProc }, + }; + const unsigned i = pDesc->cParams; + AssertReleaseMsg(i < RT_ELEMENTS(s_aTemplates), ("%d (%s)\n", i, pDesc->pszName)); + + /* duplicate the patch. */ + const size_t cb = (uintptr_t)s_aTemplates[i].pvEnd - (uintptr_t)s_aTemplates[i].pvStart; + uint8_t *pb = (uint8_t *)remAllocGlue(cb); + AssertReturn(pb, VERR_NO_MEMORY); + memcpy(pb, s_aTemplates[i].pvStart, cb); + + /* fix it up. */ + remGenerateExportGlueFixup(pb, cb, *pValue, pDesc); + *ppfn = (uintptr_t)pb; + } + else + { + /* custom hacks - it's simpler to make assembly templates than writing a more generic code generator... */ + static const struct { const char *pszName; PFNRT pvStart, pvEnd; } s_aTemplates[] = + { + { "somefunction", (PFNRT)&WrapMSC2GCC9Int, (PFNRT)&WrapMSC2GCC9Int_EndProc }, + }; + unsigned i; + for (i = 0; i < RT_ELEMENTS(s_aTemplates); i++) + if (!strcmp(pDesc->pszName, s_aTemplates[i].pszName)) + break; + AssertReleaseMsgReturn(i < RT_ELEMENTS(s_aTemplates), ("Not implemented! 
%s\n", pDesc->pszName), VERR_NOT_IMPLEMENTED); + + /* duplicate the patch. */ + const size_t cb = (uintptr_t)s_aTemplates[i].pvEnd - (uintptr_t)s_aTemplates[i].pvStart; + uint8_t *pb = (uint8_t *)remAllocGlue(cb); + AssertReturn(pb, VERR_NO_MEMORY); + memcpy(pb, s_aTemplates[i].pvStart, cb); + + /* fix it up. */ + remGenerateExportGlueFixup(pb, cb, *pValue, pDesc); + *ppfn = (uintptr_t)pb; + } + } + *pValue = *ppfn; + return VINF_SUCCESS; +# else /* !USE_REM_CALLING_CONVENTION_GLUE */ + return VINF_SUCCESS; +# endif /* !USE_REM_CALLING_CONVENTION_GLUE */ +} + + +/** + * Generate wrapper glue code for an import. + * + * This is only used on win64 when loading a 64-bit linux module. So, on other + * platforms it will simply return the address of the imported function + * without generating any glue code. + * + * @returns VBox status code. + * @param pValue Where to store the glue address. + * @param pDesc The export descriptor. + */ +static int remGenerateImportGlue(PRTUINTPTR pValue, PREMFNDESC pDesc) +{ +# if defined(USE_REM_CALLING_CONVENTION_GLUE) || defined(USE_REM_IMPORT_JUMP_GLUE) + if (!pDesc->pvWrapper) + { +# ifdef USE_REM_CALLING_CONVENTION_GLUE + if (remIsFunctionAllInts(pDesc)) + { + static const struct { void *pvStart, *pvEnd; } s_aTemplates[] = + { + { (void *)&WrapGCC2MSC0Int, (void *)&WrapGCC2MSC0Int_EndProc }, + { (void *)&WrapGCC2MSC1Int, (void *)&WrapGCC2MSC1Int_EndProc }, + { (void *)&WrapGCC2MSC2Int, (void *)&WrapGCC2MSC2Int_EndProc }, + { (void *)&WrapGCC2MSC3Int, (void *)&WrapGCC2MSC3Int_EndProc }, + { (void *)&WrapGCC2MSC4Int, (void *)&WrapGCC2MSC4Int_EndProc }, + { (void *)&WrapGCC2MSC5Int, (void *)&WrapGCC2MSC5Int_EndProc }, + { (void *)&WrapGCC2MSC6Int, (void *)&WrapGCC2MSC6Int_EndProc }, + { (void *)&WrapGCC2MSC7Int, (void *)&WrapGCC2MSC7Int_EndProc }, + { (void *)&WrapGCC2MSC8Int, (void *)&WrapGCC2MSC8Int_EndProc }, + { (void *)&WrapGCC2MSC9Int, (void *)&WrapGCC2MSC9Int_EndProc }, + { (void *)&WrapGCC2MSC10Int, (void *)&WrapGCC2MSC10Int_EndProc }, + { (void *)&WrapGCC2MSC11Int, (void *)&WrapGCC2MSC11Int_EndProc }, + { (void *)&WrapGCC2MSC12Int, (void *)&WrapGCC2MSC12Int_EndProc } + }; + const unsigned i = pDesc->cParams; + AssertReleaseMsg(i < RT_ELEMENTS(s_aTemplates), ("%d (%s)\n", i, pDesc->pszName)); + + /* duplicate the patch. */ + const size_t cb = (uintptr_t)s_aTemplates[i].pvEnd - (uintptr_t)s_aTemplates[i].pvStart; + pDesc->pvWrapper = remAllocGlue(cb); + AssertReturn(pDesc->pvWrapper, VERR_NO_MEMORY); + memcpy(pDesc->pvWrapper, s_aTemplates[i].pvStart, cb); + + /* fix it up. */ + remGenerateImportGlueFixup((uint8_t *)pDesc->pvWrapper, cb, pDesc); + } + else if ( remHasFunctionEllipsis(pDesc) + && !remIsFunctionUsingFP(pDesc)) + { + /* duplicate the patch. */ + const size_t cb = (uintptr_t)&WrapGCC2MSCVariadictInt_EndProc - (uintptr_t)&WrapGCC2MSCVariadictInt; + pDesc->pvWrapper = remAllocGlue(cb); + AssertReturn(pDesc->pvWrapper, VERR_NO_MEMORY); + memcpy(pDesc->pvWrapper, (void *)&WrapGCC2MSCVariadictInt, cb); + + /* fix it up. */ + remGenerateImportGlueFixup((uint8_t *)pDesc->pvWrapper, cb, pDesc); + } + else + { + /* custom hacks - it's simpler to make assembly templates than writing a more generic code generator... 
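+               Imports that neither the integer nor the variadic templates can
+               handle get a hand written wrapper.  Currently that is only
+               SSMR3RegisterInternal, whose callback arguments are taken care
+               of by WrapGCC2MSC_SSMR3RegisterInternal in VBoxREMWrapperA.asm.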
*/ + static const struct { const char *pszName; PFNRT pvStart, pvEnd; } s_aTemplates[] = + { + { "SSMR3RegisterInternal", (PFNRT)&WrapGCC2MSC_SSMR3RegisterInternal, (PFNRT)&WrapGCC2MSC_SSMR3RegisterInternal_EndProc }, + }; + unsigned i; + for (i = 0; i < RT_ELEMENTS(s_aTemplates); i++) + if (!strcmp(pDesc->pszName, s_aTemplates[i].pszName)) + break; + AssertReleaseMsgReturn(i < RT_ELEMENTS(s_aTemplates), ("Not implemented! %s\n", pDesc->pszName), VERR_NOT_IMPLEMENTED); + + /* duplicate the patch. */ + const size_t cb = (uintptr_t)s_aTemplates[i].pvEnd - (uintptr_t)s_aTemplates[i].pvStart; + pDesc->pvWrapper = remAllocGlue(cb); + AssertReturn(pDesc->pvWrapper, VERR_NO_MEMORY); + memcpy(pDesc->pvWrapper, s_aTemplates[i].pvStart, cb); + + /* fix it up. */ + remGenerateImportGlueFixup((uint8_t *)pDesc->pvWrapper, cb, pDesc); + } +# else /* !USE_REM_CALLING_CONVENTION_GLUE */ + + /* + * Generate a jump patch. + */ + uint8_t *pb; +# ifdef RT_ARCH_AMD64 + pDesc->pvWrapper = pb = (uint8_t *)remAllocGlue(32); + AssertReturn(pDesc->pvWrapper, VERR_NO_MEMORY); + /**pb++ = 0xcc;*/ + *pb++ = 0xff; + *pb++ = 0x24; + *pb++ = 0x25; + *(uint32_t *)pb = (uintptr_t)pb + 5; + pb += 5; + *(uint64_t *)pb = (uint64_t)pDesc->pv; +# else + pDesc->pvWrapper = pb = (uint8_t *)remAllocGlue(8); + AssertReturn(pDesc->pvWrapper, VERR_NO_MEMORY); + *pb++ = 0xea; + *(uint32_t *)pb = (uint32_t)pDesc->pv; +# endif +# endif /* !USE_REM_CALLING_CONVENTION_GLUE */ + } + *pValue = (uintptr_t)pDesc->pvWrapper; +# else /* !USE_REM_CALLING_CONVENTION_GLUE */ + *pValue = (uintptr_t)pDesc->pv; +# endif /* !USE_REM_CALLING_CONVENTION_GLUE */ + return VINF_SUCCESS; +} + + +/** + * Resolve an external symbol during RTLdrGetBits(). + * + * @returns iprt status code. + * @param hLdrMod The loader module handle. + * @param pszModule Module name. + * @param pszSymbol Symbol name, NULL if uSymbol should be used. + * @param uSymbol Symbol ordinal, ~0 if pszSymbol should be used. + * @param pValue Where to store the symbol value (address). + * @param pvUser User argument. + */ +static DECLCALLBACK(int) remGetImport(RTLDRMOD hLdrMod, const char *pszModule, const char *pszSymbol, unsigned uSymbol, RTUINTPTR *pValue, void *pvUser) +{ + unsigned i; + for (i = 0; i < RT_ELEMENTS(g_aVMMImports); i++) + if (!strcmp(g_aVMMImports[i].pszName, pszSymbol)) + return remGenerateImportGlue(pValue, &g_aVMMImports[i]); + for (i = 0; i < RT_ELEMENTS(g_aRTImports); i++) + if (!strcmp(g_aRTImports[i].pszName, pszSymbol)) + return remGenerateImportGlue(pValue, &g_aRTImports[i]); + for (i = 0; i < RT_ELEMENTS(g_aCRTImports); i++) + if (!strcmp(g_aCRTImports[i].pszName, pszSymbol)) + return remGenerateImportGlue(pValue, &g_aCRTImports[i]); + LogRel(("Missing REM Import: %s\n", pszSymbol)); +# if 1 + *pValue = 0; + AssertMsgFailed(("%s.%s\n", pszModule, pszSymbol)); + return VERR_SYMBOL_NOT_FOUND; +# else + return remGenerateImportGlue(pValue, &g_aCRTImports[0]); +# endif +} + +/** + * Loads the linux object, resolves all imports and exports. + * + * @returns VBox status code. + */ +static int remLoadLinuxObj(void) +{ + size_t offFilename; + char szPath[RTPATH_MAX]; + int rc = RTPathAppPrivateArch(szPath, sizeof(szPath) - 32); + AssertRCReturn(rc, rc); + offFilename = strlen(szPath); + +# ifdef VBOX_WITHOUT_REM_LDR_CYCLE + /* + * Resolve all the VBoxVMM references. 
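+     * (Without a link-time reference to VBoxVMM the pv member of each
+     * g_aVMMImports entry has to be filled in here via RTLdrGetSymbol
+     * before any import glue referring to it is generated.)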
+ */ + if (g_ModVMM != NIL_RTLDRMOD) + { + rc = SUPR3HardenedLdrLoadAppPriv("VBoxVMM", &g_ModVMM, RTLDRLOAD_FLAGS_LOCAL, NULL); + AssertRCReturn(rc, rc); + for (size_t i = 0; i < RT_ELEMENTS(g_aVMMImports); i++) + { + rc = RTLdrGetSymbol(g_ModVMM, g_aVMMImports[i].pszName, &g_aVMMImports[i].pv); + AssertLogRelMsgRCReturn(rc, ("RTLdrGetSymbol(VBoxVMM,%s,) -> %Rrc\n", g_aVMMImports[i].pszName, rc), rc); + } + } +# endif + + /* + * Load the VBoxREM2.rel object/DLL. + */ + strcpy(&szPath[offFilename], "/VBoxREM2.rel"); + rc = RTLdrOpen(szPath, 0, RTLDRARCH_HOST, &g_ModREM2); + if (RT_SUCCESS(rc)) + { + g_cbREM2 = RTLdrSize(g_ModREM2); + g_pvREM2 = RTMemExecAlloc(g_cbREM2); + if (g_pvREM2) + { + RTPathChangeToUnixSlashes(szPath, true); +# ifdef DEBUG /* How to load the VBoxREM2.rel symbols into the GNU debugger. */ + RTPrintf("VBoxREMWrapper: (gdb) add-symbol-file %s 0x%p\n", szPath, g_pvREM2); +# endif + LogRel(("REM: Loading %s at 0x%p (%d bytes)\n" + "REM: (gdb) add-symbol-file %s 0x%p\n", + szPath, g_pvREM2, RTLdrSize(g_ModREM2), szPath, g_pvREM2)); + rc = RTLdrGetBits(g_ModREM2, g_pvREM2, (RTUINTPTR)g_pvREM2, remGetImport, NULL); + if (RT_SUCCESS(rc)) + { + /* + * Resolve exports. + */ + unsigned i; + for (i = 0; i < RT_ELEMENTS(g_aExports); i++) + { + RTUINTPTR Value; + rc = RTLdrGetSymbolEx(g_ModREM2, g_pvREM2, (RTUINTPTR)g_pvREM2, UINT32_MAX, g_aExports[i].pszName, &Value); + AssertMsgRC(rc, ("%s rc=%Rrc\n", g_aExports[i].pszName, rc)); + if (RT_FAILURE(rc)) + break; + rc = remGenerateExportGlue(&Value, &g_aExports[i]); + if (RT_FAILURE(rc)) + break; + *(void **)g_aExports[i].pv = (void *)(uintptr_t)Value; + } + return rc; + } + + RTMemExecFree(g_pvREM2, g_cbREM2); + g_pvREM2 = NULL; + } + g_cbREM2 = 0; + RTLdrClose(g_ModREM2); + g_ModREM2 = NIL_RTLDRMOD; + } + LogRel(("REM: failed loading '%s', rc=%Rrc\n", szPath, rc)); + return rc; +} + + +/** + * Unloads the linux object, freeing up all resources (dlls and + * import glue) we allocated during remLoadLinuxObj(). + */ +static void remUnloadLinuxObj(void) +{ + unsigned i; + + /* close modules. */ + RTLdrClose(g_ModREM2); + g_ModREM2 = NIL_RTLDRMOD; + RTMemExecFree(g_pvREM2, g_cbREM2); + g_pvREM2 = NULL; + g_cbREM2 = 0; + + /* clear the pointers. */ + for (i = 0; i < RT_ELEMENTS(g_aExports); i++) + *(void **)g_aExports[i].pv = NULL; +# if defined(USE_REM_CALLING_CONVENTION_GLUE) || defined(USE_REM_IMPORT_JUMP_GLUE) + for (i = 0; i < RT_ELEMENTS(g_aVMMImports); i++) + g_aVMMImports[i].pvWrapper = NULL; + for (i = 0; i < RT_ELEMENTS(g_aRTImports); i++) + g_aRTImports[i].pvWrapper = NULL; + for (i = 0; i < RT_ELEMENTS(g_aCRTImports); i++) + g_aCRTImports[i].pvWrapper = NULL; + + /* free wrapper memory. */ + while (g_pExecMemHead) + { + PREMEXECMEM pCur = g_pExecMemHead; + g_pExecMemHead = pCur->pNext; + memset(pCur, 0xcc, pCur->cb); + RTMemExecFree(pCur, pCur->cb); + } +# endif +} + +# else /* VBOX_USE_BITNESS_SELECTOR */ + +/** + * Checks if 64-bit support is enabled. + * + * @returns true / false. + * @param pVM Pointer to the shared VM structure. 
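+ *
+ * @remarks Simply queries the CFGM value REM/64bitEnabled (default false),
+ *          going through manually resolved CFGMR3* entry points when
+ *          VBOX_WITHOUT_REM_LDR_CYCLE is defined.  A rough sketch of how such
+ *          a key might be set up by VM construction code (ordinary CFGM
+ *          insert calls, nothing this file does itself):
+ *          @code
+ *              PCFGMNODE pRemCfg;
+ *              CFGMR3InsertNode(CFGMR3GetRoot(pVM), "REM", &pRemCfg);
+ *              CFGMR3InsertInteger(pRemCfg, "64bitEnabled", 1);
+ *          @endcode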
+ */ +static bool remIs64bitEnabled(PVM pVM) +{ + bool f; + int rc; + +# ifdef VBOX_WITHOUT_REM_LDR_CYCLE + if (g_ModVMM == NIL_RTLDRMOD) + { + rc = SUPR3HardenedLdrLoadAppPriv("VBoxVMM", &g_ModVMM, RTLDRLOAD_FLAGS_LOCAL, NULL); + AssertRCReturn(rc, false); + } + + DECLCALLBACKMEMBER(PCFGMNODE, pfnCFGMR3GetRoot)(PVM); + rc = RTLdrGetSymbol(g_ModVMM, "CFGMR3GetRoot", (void **)&pfnCFGMR3GetRoot); + AssertRCReturn(rc, false); + + DECLCALLBACKMEMBER(PCFGMNODE, pfnCFGMR3GetChild)(PCFGMNODE, const char *); + rc = RTLdrGetSymbol(g_ModVMM, "CFGMR3GetChild", (void **)&pfnCFGMR3GetChild); + AssertRCReturn(rc, false); + + DECLCALLBACKMEMBER(int, pfnCFGMR3QueryBoolDef)(PCFGMNODE, const char *, bool *, bool); + rc = RTLdrGetSymbol(g_ModVMM, "CFGMR3QueryBoolDef", (void **)&pfnCFGMR3QueryBoolDef); + AssertRCReturn(rc, false); + + rc = pfnCFGMR3QueryBoolDef(pfnCFGMR3GetChild(pfnCFGMR3GetRoot(pVM), "REM"), "64bitEnabled", &f, false); +# else + rc = CFGMR3QueryBoolDef(CFGMR3GetChild(CFGMR3GetRoot(pVM), "REM"), "64bitEnabled", &f, false); +# endif + AssertRCReturn(rc, false); + return f; +} + + +/** + * Loads real REM object, resolves all exports (imports are done by native loader). + * + * @returns VBox status code. + */ +static int remLoadProperObj(PVM pVM) +{ + /* + * Load the VBoxREM32/64 object/DLL. + */ + const char *pszModule = remIs64bitEnabled(pVM) ? "VBoxREM64" : "VBoxREM32"; + int rc = SUPR3HardenedLdrLoadAppPriv(pszModule, &g_ModREM2, RTLDRLOAD_FLAGS_LOCAL, NULL); + if (RT_SUCCESS(rc)) + { + LogRel(("REM: %s\n", pszModule)); + + /* + * Resolve exports. + */ + unsigned i; + for (i = 0; i < RT_ELEMENTS(g_aExports); i++) + { + void *pvValue; + rc = RTLdrGetSymbol(g_ModREM2, g_aExports[i].pszName, &pvValue); + AssertLogRelMsgRCBreak(rc, ("%s rc=%Rrc\n", g_aExports[i].pszName, rc)); + *(void **)g_aExports[i].pv = pvValue; + } + } + + return rc; +} + + +/** + * Unloads the real REM object. + */ +static void remUnloadProperObj(void) +{ + /* close module. 
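+       (Just drop the reference to VBoxREM32/VBoxREM64 taken in
+       remLoadProperObj; unlike remUnloadLinuxObj there is no glue memory or
+       import state to clean up on this path.)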
*/ + RTLdrClose(g_ModREM2); + g_ModREM2 = NIL_RTLDRMOD; +} + +# endif /* VBOX_USE_BITNESS_SELECTOR */ +#endif /* USE_REM_STUBS */ + +REMR3DECL(int) REMR3Init(PVM pVM) +{ +#ifdef USE_REM_STUBS + return VINF_SUCCESS; + +#elif defined(VBOX_USE_BITNESS_SELECTOR) + if (!pfnREMR3Init) + { + int rc = remLoadProperObj(pVM); + if (RT_FAILURE(rc)) + return rc; + } + return pfnREMR3Init(pVM); + +#else + if (!pfnREMR3Init) + { + int rc = remLoadLinuxObj(); + if (RT_FAILURE(rc)) + return rc; + } + return pfnREMR3Init(pVM); +#endif +} + +REMR3DECL(int) REMR3InitFinalize(PVM pVM) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3InitFinalize)); + return pfnREMR3InitFinalize(pVM); +#endif +} + +REMR3DECL(int) REMR3Term(PVM pVM) +{ +#ifdef USE_REM_STUBS + return VINF_SUCCESS; + +#elif defined(VBOX_USE_BITNESS_SELECTOR) + int rc; + Assert(VALID_PTR(pfnREMR3Term)); + rc = pfnREMR3Term(pVM); + remUnloadProperObj(); + return rc; + +#else + int rc; + Assert(VALID_PTR(pfnREMR3Term)); + rc = pfnREMR3Term(pVM); + remUnloadLinuxObj(); + return rc; +#endif +} + +REMR3DECL(void) REMR3Reset(PVM pVM) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3Reset)); + pfnREMR3Reset(pVM); +#endif +} + +REMR3DECL(int) REMR3Step(PVM pVM, PVMCPU pVCpu) +{ +#ifdef USE_REM_STUBS + return VERR_NOT_IMPLEMENTED; +#else + Assert(VALID_PTR(pfnREMR3Step)); + return pfnREMR3Step(pVM, pVCpu); +#endif +} + +REMR3DECL(int) REMR3BreakpointSet(PVM pVM, RTGCUINTPTR Address) +{ +#ifdef USE_REM_STUBS + return VERR_REM_NO_MORE_BP_SLOTS; +#else + Assert(VALID_PTR(pfnREMR3BreakpointSet)); + return pfnREMR3BreakpointSet(pVM, Address); +#endif +} + +REMR3DECL(int) REMR3BreakpointClear(PVM pVM, RTGCUINTPTR Address) +{ +#ifdef USE_REM_STUBS + return VERR_NOT_IMPLEMENTED; +#else + Assert(VALID_PTR(pfnREMR3BreakpointClear)); + return pfnREMR3BreakpointClear(pVM, Address); +#endif +} + +REMR3DECL(int) REMR3EmulateInstruction(PVM pVM, PVMCPU pVCpu) +{ +#ifdef USE_REM_STUBS + return VERR_NOT_IMPLEMENTED; +#else + Assert(VALID_PTR(pfnREMR3EmulateInstruction)); + return pfnREMR3EmulateInstruction(pVM, pVCpu); +#endif +} + +REMR3DECL(int) REMR3Run(PVM pVM, PVMCPU pVCpu) +{ +#ifdef USE_REM_STUBS + return VERR_NOT_IMPLEMENTED; +#else + Assert(VALID_PTR(pfnREMR3Run)); + return pfnREMR3Run(pVM, pVCpu); +#endif +} + +REMR3DECL(int) REMR3State(PVM pVM, PVMCPU pVCpu) +{ +#ifdef USE_REM_STUBS + return VERR_NOT_IMPLEMENTED; +#else + Assert(VALID_PTR(pfnREMR3State)); + return pfnREMR3State(pVM, pVCpu); +#endif +} + +REMR3DECL(int) REMR3StateBack(PVM pVM, PVMCPU pVCpu) +{ +#ifdef USE_REM_STUBS + return VERR_NOT_IMPLEMENTED; +#else + Assert(VALID_PTR(pfnREMR3StateBack)); + return pfnREMR3StateBack(pVM, pVCpu); +#endif +} + +REMR3DECL(void) REMR3StateUpdate(PVM pVM, PVMCPU pVCpu) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3StateUpdate)); + pfnREMR3StateUpdate(pVM, pVCpu); +#endif +} + +REMR3DECL(void) REMR3A20Set(PVM pVM, PVMCPU pVCpu, bool fEnable) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3A20Set)); + pfnREMR3A20Set(pVM, pVCpu, fEnable); +#endif +} + +REMR3DECL(void) REMR3ReplayHandlerNotifications(PVM pVM) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3ReplayHandlerNotifications)); + pfnREMR3ReplayHandlerNotifications(pVM); +#endif +} + +REMR3DECL(int) REMR3NotifyCodePageChanged(PVM pVM, PVMCPU pVCpu, RTGCPTR pvCodePage) +{ +#ifdef USE_REM_STUBS + return VINF_SUCCESS; +#else + Assert(VALID_PTR(pfnREMR3NotifyCodePageChanged)); + return pfnREMR3NotifyCodePageChanged(pVM, pVCpu, pvCodePage); +#endif +} + +REMR3DECL(void) 
REMR3NotifyPhysRamRegister(PVM pVM, RTGCPHYS GCPhys, RTGCPHYS cb, unsigned fFlags) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyPhysRamRegister)); + pfnREMR3NotifyPhysRamRegister(pVM, GCPhys, cb, fFlags); +#endif +} + +REMR3DECL(void) REMR3NotifyPhysRomRegister(PVM pVM, RTGCPHYS GCPhys, RTUINT cb, void *pvCopy, bool fShadow) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyPhysRomRegister)); + pfnREMR3NotifyPhysRomRegister(pVM, GCPhys, cb, pvCopy, fShadow); +#endif +} + +REMR3DECL(void) REMR3NotifyPhysRamDeregister(PVM pVM, RTGCPHYS GCPhys, RTUINT cb) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyPhysRamDeregister)); + pfnREMR3NotifyPhysRamDeregister(pVM, GCPhys, cb); +#endif +} + +REMR3DECL(void) REMR3NotifyHandlerPhysicalRegister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, bool fHasHCHandler) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyHandlerPhysicalRegister)); + pfnREMR3NotifyHandlerPhysicalRegister(pVM, enmKind, GCPhys, cb, fHasHCHandler); +#endif +} + +REMR3DECL(void) REMR3NotifyHandlerPhysicalDeregister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, bool fHasHCHandler, bool fRestoreAsRAM) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyHandlerPhysicalDeregister)); + pfnREMR3NotifyHandlerPhysicalDeregister(pVM, enmKind, GCPhys, cb, fHasHCHandler, fRestoreAsRAM); +#endif +} + +REMR3DECL(void) REMR3NotifyHandlerPhysicalModify(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhysOld, RTGCPHYS GCPhysNew, RTGCPHYS cb, bool fHasHCHandler, bool fRestoreAsRAM) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyHandlerPhysicalModify)); + pfnREMR3NotifyHandlerPhysicalModify(pVM, enmKind, GCPhysOld, GCPhysNew, cb, fHasHCHandler, fRestoreAsRAM); +#endif +} + +REMR3DECL(bool) REMR3IsPageAccessHandled(PVM pVM, RTGCPHYS GCPhys) +{ +#ifdef USE_REM_STUBS + return false; +#else + Assert(VALID_PTR(pfnREMR3IsPageAccessHandled)); + return pfnREMR3IsPageAccessHandled(pVM, GCPhys); +#endif +} + +REMR3DECL(int) REMR3DisasEnableStepping(PVM pVM, bool fEnable) +{ +#ifdef USE_REM_STUBS + return VERR_NOT_IMPLEMENTED; +#else + Assert(VALID_PTR(pfnREMR3DisasEnableStepping)); + return pfnREMR3DisasEnableStepping(pVM, fEnable); +#endif +} + +REMR3DECL(void) REMR3NotifyInterruptSet(PVM pVM, PVMCPU pVCpu) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyInterruptSet)); + pfnREMR3NotifyInterruptSet(pVM, pVCpu); +#endif +} + +REMR3DECL(void) REMR3NotifyInterruptClear(PVM pVM, PVMCPU pVCpu) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyInterruptClear)); + pfnREMR3NotifyInterruptClear(pVM, pVCpu); +#endif +} + +REMR3DECL(void) REMR3NotifyTimerPending(PVM pVM, PVMCPU pVCpuDst) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyTimerPending)); + pfnREMR3NotifyTimerPending(pVM, pVCpuDst); +#endif +} + +REMR3DECL(void) REMR3NotifyDmaPending(PVM pVM) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyDmaPending)); + pfnREMR3NotifyDmaPending(pVM); +#endif +} + +REMR3DECL(void) REMR3NotifyQueuePending(PVM pVM) +{ +#ifndef USE_REM_STUBS + Assert(VALID_PTR(pfnREMR3NotifyQueuePending)); + pfnREMR3NotifyQueuePending(pVM); +#endif +} + +REMR3DECL(void) REMR3NotifyFF(PVM pVM) +{ +#ifndef USE_REM_STUBS + /* the timer can call this early on, so don't be picky. 
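+       I.e. pfnREMR3NotifyFF may still be NULL because the REM module has not
+       been loaded yet, so unlike the other wrappers this one checks the
+       pointer instead of asserting on it.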
*/ + if (pfnREMR3NotifyFF) + pfnREMR3NotifyFF(pVM); +#endif +} diff --git a/src/recompiler/VBoxREMWrapperA.asm b/src/recompiler/VBoxREMWrapperA.asm new file mode 100644 index 00000000..a947031f --- /dev/null +++ b/src/recompiler/VBoxREMWrapperA.asm @@ -0,0 +1,912 @@ +; $Id: VBoxREMWrapperA.asm $ +;; @file +; VBoxREM Wrapper, Assembly routines and wrapper Templates. +; + +; +; Copyright (C) 2006-2019 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org. This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +; + + + + +;******************************************************************************* +;* Header Files * +;******************************************************************************* +%include "iprt/asmdefs.mac" + +%define REM_FIXUP_32_REAL_STUFF 0deadbeefh +%define REM_FIXUP_64_REAL_STUFF 0deadf00df00ddeadh +%define REM_FIXUP_64_DESC 0dead00010001deadh +%define REM_FIXUP_64_LOG_ENTRY 0dead00020002deadh +%define REM_FIXUP_64_LOG_EXIT 0dead00030003deadh +%define REM_FIXUP_64_WRAP_GCC_CB 0dead00040004deadh + +;%define ENTRY_LOGGING 1 +;%define EXIT_LOGGING 1 + + +%ifdef RT_ARCH_AMD64 + ;; + ; 64-bit pushad + %macro MY_PUSHAQ 0 + push rax + push rbx + push rcx + push rdx + push rsi + push rdi + push rbp + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + %endmacro + + ;; + ; 64-bit popad + %macro MY_POPAQ 0 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop rbp + pop rdi + pop rsi + pop rdx + pop rcx + pop rbx + pop rax + %endmacro + + ;; + ; Entry logging + %ifdef ENTRY_LOGGING + %macro LOG_ENTRY 0 + MY_PUSHAQ + push rbp + mov rbp, rsp + and rsp, ~0fh + sub rsp, 20h ; shadow space + + %ifdef RT_OS_WINDOWS + mov rcx, REM_FIXUP_64_DESC + %else + mov rdi, REM_FIXUP_64_DESC + %endif + mov rax, REM_FIXUP_64_LOG_ENTRY + call rax + + leave + MY_POPAQ + %endmacro + %else + %define LOG_ENTRY + %endif + + ;; + ; Exit logging + %ifdef EXIT_LOGGING + %macro LOG_EXIT 0 + MY_PUSHAQ + push rbp + mov rbp, rsp + and rsp, ~0fh + sub rsp, 20h ; shadow space + + %ifdef RT_OS_WINDOWS + mov rdx, rax + mov rcx, REM_FIXUP_64_DESC + %else + mov rsi, eax + mov rdi, REM_FIXUP_64_DESC + %endif + mov rax, REM_FIXUP_64_LOG_EXIT + call rax + + leave + MY_POPAQ + %endmacro + %else + %define LOG_EXIT + %endif + +%else + %define LOG_ENTRY + %define LOG_EXIT +%endif + + +BEGINCODE + +%ifdef RT_OS_WINDOWS + %ifdef RT_ARCH_AMD64 + + +BEGINPROC WrapGCC2MSC0Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC0Int + + +BEGINPROC WrapGCC2MSC1Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC1Int + + +BEGINPROC WrapGCC2MSC2Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, 
REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC2Int + + +BEGINPROC WrapGCC2MSC3Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC3Int + + +BEGINPROC WrapGCC2MSC4Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC4Int + + +BEGINPROC WrapGCC2MSC5Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 30h + + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC5Int + + +BEGINPROC WrapGCC2MSC6Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 30h + + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC6Int + + +BEGINPROC WrapGCC2MSC7Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 40h + + mov r11, [rbp + 10h] + mov [rsp + 30h], r11 + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC7Int + + +BEGINPROC WrapGCC2MSC8Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 40h + + mov r10, [rbp + 18h] + mov [rsp + 38h], r10 + mov r11, [rbp + 10h] + mov [rsp + 30h], r11 + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC8Int + + +BEGINPROC WrapGCC2MSC9Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 50h + + mov rax, [rbp + 20h] + mov [rsp + 40h], rax + mov r10, [rbp + 18h] + mov [rsp + 38h], r10 + mov r11, [rbp + 10h] + mov [rsp + 30h], r11 + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC9Int + + +BEGINPROC WrapGCC2MSC10Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 50h + + mov r11, [rbp + 28h] + mov [rsp + 48h], r11 + mov rax, [rbp + 20h] + mov [rsp + 40h], rax + mov r10, [rbp + 18h] + mov [rsp + 38h], r10 + mov r11, [rbp + 10h] + mov [rsp + 30h], r11 + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC10Int + + +BEGINPROC WrapGCC2MSC11Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 60h + + mov r10, [rbp + 30h] + mov [rsp + 50h], r10 + mov r11, [rbp + 28h] + mov [rsp + 48h], r11 + mov rax, [rbp + 20h] + mov [rsp + 40h], rax + mov 
r10, [rbp + 18h] + mov [rsp + 38h], r10 + mov r11, [rbp + 10h] + mov [rsp + 30h], r11 + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC11Int + + +BEGINPROC WrapGCC2MSC12Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 60h + + mov rax, [rbp + 28h] + mov [rsp + 48h], rax + mov r10, [rbp + 30h] + mov [rsp + 50h], r10 + mov r11, [rbp + 28h] + mov [rsp + 48h], r11 + mov rax, [rbp + 20h] + mov [rsp + 40h], rax + mov r10, [rbp + 18h] + mov [rsp + 38h], r10 + mov r11, [rbp + 10h] + mov [rsp + 30h], r11 + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov r8, rdx + mov rdx, rsi + mov rcx, rdi +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC12Int + + + +BEGINPROC WrapGCC2MSCVariadictInt + LOG_ENTRY +%ifdef DEBUG + ; check that there are NO floting point arguments in XMM registers! + or rax, rax + jz .ok + int3 +.ok: +%endif + sub rsp, 28h + mov r11, [rsp + 28h] ; r11 = return address. + mov [rsp + 28h], r9 + mov [rsp + 20h], r8 + mov r9, rcx + mov [rsp + 18h], r9 ; (*) + mov r8, rdx + mov [rsp + 14h], r8 ; (*) + mov rdx, rsi + mov [rsp + 8h], rdx ; (*) + mov rcx, rdi + mov [rsp], rcx ; (*) + mov rsi, r11 ; rsi is preserved by the callee. +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + add rsp, 30h + LOG_EXIT + jmp rsi + ; (*) unconditionally spill the registers, just in case '...' implies weird stuff on MSC. Check this out! +ENDPROC WrapGCC2MSCVariadictInt + + +;; +; Custom template for SSMR3RegisterInternal. +; +; (This is based on the WrapGCC2MSC11Int template.) 
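+;
+; SSMR3RegisterInternal needs a hand written template: it takes 14 parameters,
+; and the nine callback pointers among them (parameters 5..13) must each be
+; passed through remWrapGCCCallback (the REM_FIXUP_64_WRAP_GCC_CB calls with
+; iParam 5..13 further down) so that the callbacks handed to SSM end up
+; callable with the MSC convention.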
+; +; @cproto +; +; SSMR3DECL(int) SSMR3RegisterInternal(PVM pVM, const char *pszName, uint32_t u32Instance, uint32_t u32Version, size_t cbGuess, +; PFNSSMINTLIVEPREP pfnLivePrep, PFNSSMINTLIVEEXEC pfnLiveExec, PFNSSMINTLIVEVOTE pfnLiveVote, +; PFNSSMINTSAVEPREP pfnSavePrep, PFNSSMINTSAVEEXEC pfnSaveExec, PFNSSMINTSAVEDONE pfnSaveDone, +; PFNSSMINTLOADPREP pfnLoadPrep, PFNSSMINTLOADEXEC pfnLoadExec, PFNSSMINTLOADDONE pfnLoadDone); +; +; @param pVM rdi 0 +; @param pszName rsi 1 +; @param u32Instance rdx 2 +; @param u32Version rcx 3 +; @param cbGuess r8 4 +; @param pfnLivePrep r9 5 +; @param pfnLiveExec rbp + 10h 6 +; @param pfnLiveVote rbp + 18h 7 +; @param pfnSavePrep rbp + 20h 8 +; @param pfnSaveExec rbp + 28h 9 +; @param pfnSaveDone rbp + 30h 10 +; @param pfnLoadPrep rbp + 38h 11 +; @param pfnLoadExec rbp + 40h 12 +; @param pfnLoadDone rbp + 48h 13 +; +BEGINPROC WrapGCC2MSC_SSMR3RegisterInternal + LOG_ENTRY + push rbp + mov rbp, rsp + + sub rsp, 80h + + mov r10, [rbp + 48h] + mov [rsp + 68h], r10 ; pfnLiveDone + mov r11, [rbp + 40h] + mov [rsp + 60h], r11 ; pfnLiveExec + mov rax, [rbp + 38h] + mov [rsp + 58h], rax ; pfnLivePrep + mov r10, [rbp + 30h] + mov [rsp + 50h], r10 ; pfnLoadDone + mov r11, [rbp + 28h] + mov [rsp + 48h], r11 ; pfnLoadExec + mov rax, [rbp + 20h] + mov [rsp + 40h], rax ; pfnLoadPrep + mov r10, [rbp + 18h] + mov [rsp + 38h], r10 ; pfnSaveDone + mov r11, [rbp + 10h] + mov [rsp + 30h], r11 ; pfnSaveExec + mov [rsp + 28h], r9 ; pfnSavePrep + mov [rsp + 20h], r8 + mov [rsp + 18h], rcx ; -> r9 + mov [rsp + 10h], rdx ; -> r8 + mov [rsp + 08h], rsi ; -> rdx + mov [rsp], rdi ; -> rcx + + ; Now convert the function pointers. Have to setup a new shadow + ; space here since the SSMR3RegisterInternal one is already in use. + sub rsp, 20h + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 28h + 20h] ; pValue + mov r8d, 5 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 30h + 20h] ; pValue + mov r8d, 6 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 38h + 20h] ; pValue + mov r8d, 7 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 40h + 20h] ; pValue + mov r8d, 8 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 48h + 20h] ; pValue + mov r8d, 9 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 50h + 20h] ; pValue + mov r8d, 10 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 58h + 20h] ; pValue + mov r8d, 11 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 60h + 20h] ; pValue + mov r8d, 12 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + mov rcx, REM_FIXUP_64_DESC ; pDesc + lea rdx, [rsp + 68h + 20h] ; pValue + mov r8d, 13 ; iParam + mov rax, REM_FIXUP_64_WRAP_GCC_CB + call rax + + add rsp, 20h + + ; finally do the call. 
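+    ; The first four GCC arguments were spilled to [rsp], [rsp+08h],
+    ; [rsp+10h] and [rsp+18h] above; reload them into the MSC argument
+    ; registers now that the callback wrapping calls are done with rcx/rdx/r8.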
+ mov r9, [rsp + 18h] + mov r8, [rsp + 10h] + mov rdx, [rsp + 08h] + mov rcx, [rsp] +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + leave + LOG_EXIT + ret +ENDPROC WrapGCC2MSC_SSMR3RegisterInternal + + +; +; The other way around: +; + + +BEGINPROC WrapMSC2GCC0Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 10h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC0Int + + +BEGINPROC WrapMSC2GCC1Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC1Int + + +BEGINPROC WrapMSC2GCC2Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx +%ifdef USE_DIRECT_CALLS + call $+5+REM_FIXUP_32_REAL_STUFF +%else + mov rax, REM_FIXUP_64_REAL_STUFF + call rax +%endif + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC2Int + + +BEGINPROC WrapMSC2GCC3Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + call $+5+REM_FIXUP_32_REAL_STUFF + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC3Int + + +BEGINPROC WrapMSC2GCC4Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + call $+5+REM_FIXUP_32_REAL_STUFF + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC4Int + + +BEGINPROC WrapMSC2GCC5Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rbp + 30h] + call $+5+REM_FIXUP_32_REAL_STUFF + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC5Int + + +BEGINPROC WrapMSC2GCC6Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 20h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rbp + 30h] + mov r9, [rbp + 38h] + call $+5+REM_FIXUP_32_REAL_STUFF + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC6Int + + +BEGINPROC WrapMSC2GCC7Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 30h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rbp + 30h] + mov r9, [rbp + 38h] + mov r10, [rbp + 40h] + mov [rsp], r10 + call $+5+REM_FIXUP_32_REAL_STUFF + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC7Int + + +BEGINPROC WrapMSC2GCC8Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 30h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rbp + 30h] + mov r9, [rbp + 38h] + mov r10, [rbp + 40h] + mov [rsp], r10 + mov r11, [rbp + 48h] + mov [rsp + 8], r11 + call $+5+REM_FIXUP_32_REAL_STUFF + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 
10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC8Int + + +BEGINPROC WrapMSC2GCC9Int + LOG_ENTRY + push rbp + mov rbp, rsp + sub rsp, 40h + mov [rbp - 10h], rsi + mov [rbp - 18h], rdi + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rbp + 30h] + mov r9, [rbp + 38h] + mov r10, [rbp + 40h] + mov [rsp], r10 + mov r11, [rbp + 48h] + mov [rsp + 8], r11 + mov rax, [rbp + 50h] + mov [rsp + 10h], rax + call $+5+REM_FIXUP_32_REAL_STUFF + + mov rdi, [rbp - 18h] + mov rsi, [rbp - 10h] + leave + LOG_EXIT + ret +ENDPROC WrapMSC2GCC9Int + + %endif ; RT_ARCH_AMD64 +%endif ; RT_OS_WINDOWS + diff --git a/src/recompiler/VBoxRecompiler.c b/src/recompiler/VBoxRecompiler.c new file mode 100644 index 00000000..fd19df22 --- /dev/null +++ b/src/recompiler/VBoxRecompiler.c @@ -0,0 +1,5481 @@ +/* $Id: VBoxRecompiler.c $ */ +/** @file + * VBox Recompiler - QEMU. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + +/** @page pg_rem REM - Recompiled Execution Manager. + * + * The recompiled exeuction manager (REM) serves the final fallback for guest + * execution, after HM / raw-mode and IEM have given up. + * + * The REM is qemu with a whole bunch of VBox specific customization for + * interfacing with PATM, CSAM, PGM and other components. + * + * @sa @ref grp_rem + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_REM +#include <stdio.h> /* FILE */ +#include "osdep.h" +#include "config.h" +#include "cpu.h" +#include "exec-all.h" +#include "ioport.h" + +#include <VBox/vmm/rem.h> +#include <VBox/vmm/vmapi.h> +#include <VBox/vmm/tm.h> +#include <VBox/vmm/ssm.h> +#include <VBox/vmm/em.h> +#include <VBox/vmm/iem.h> +#include <VBox/vmm/trpm.h> +#include <VBox/vmm/iom.h> +#include <VBox/vmm/mm.h> +#include <VBox/vmm/pgm.h> +#include <VBox/vmm/pdm.h> +#include <VBox/vmm/dbgf.h> +#include <VBox/dbg.h> +#include <VBox/vmm/apic.h> +#include <VBox/vmm/hm.h> +#include <VBox/vmm/patm.h> +#include <VBox/vmm/csam.h> +#include "REMInternal.h" +#include <VBox/vmm/vm.h> +#include <VBox/vmm/uvm.h> +#include <VBox/param.h> +#include <VBox/err.h> + +#include <VBox/log.h> +#include <iprt/alloca.h> +#include <iprt/semaphore.h> +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/thread.h> +#include <iprt/string.h> + +/* Don't wanna include everything. 
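+   Just declare the handful of qemu internals used below by hand rather than
+   dragging in the headers that define them.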
*/ +extern void cpu_exec_init_all(uintptr_t tb_size); +extern void cpu_x86_update_cr3(CPUX86State *env, target_ulong new_cr3); +extern void cpu_x86_update_cr0(CPUX86State *env, uint32_t new_cr0); +extern void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4); +extern void tlb_flush_page(CPUX86State *env, target_ulong addr); +extern void tlb_flush(CPUX86State *env, int flush_global); +extern void sync_seg(CPUX86State *env1, int seg_reg, int selector); +extern void sync_ldtr(CPUX86State *env1, int selector); + +#ifdef VBOX_STRICT +ram_addr_t get_phys_page_offset(target_ulong addr); +#endif + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ + +/** Copy 80-bit fpu register at pSrc to pDst. + * This is probably faster than *calling* memcpy. + */ +#define REM_COPY_FPU_REG(pDst, pSrc) \ + do { *(PX86FPUMMX)(pDst) = *(const X86FPUMMX *)(pSrc); } while (0) + +/** How remR3RunLoggingStep operates. */ +#define REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING + + +/** Selector flag shift between qemu and VBox. + * VBox shifts the qemu bits to the right. */ +#define SEL_FLAGS_SHIFT (8) +/** Mask applied to the shifted qemu selector flags to get the attributes VBox + * (VT-x) needs. */ +#define SEL_FLAGS_SMASK UINT32_C(0x1F0FF) + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static DECLCALLBACK(int) remR3Save(PVM pVM, PSSMHANDLE pSSM); +static DECLCALLBACK(int) remR3Load(PVM pVM, PSSMHANDLE pSSM, uint32_t uVersion, uint32_t uPass); +static DECLCALLBACK(int) remR3LoadDone(PVM pVM, PSSMHANDLE pSSM); +static void remR3StateUpdate(PVM pVM, PVMCPU pVCpu); +static int remR3InitPhysRamSizeAndDirtyMap(PVM pVM, bool fGuarded); + +static uint32_t remR3MMIOReadU8(void *pvEnv, target_phys_addr_t GCPhys); +static uint32_t remR3MMIOReadU16(void *pvEnv, target_phys_addr_t GCPhys); +static uint32_t remR3MMIOReadU32(void *pvEnv, target_phys_addr_t GCPhys); +static void remR3MMIOWriteU8(void *pvEnv, target_phys_addr_t GCPhys, uint32_t u32); +static void remR3MMIOWriteU16(void *pvEnv, target_phys_addr_t GCPhys, uint32_t u32); +static void remR3MMIOWriteU32(void *pvEnv, target_phys_addr_t GCPhys, uint32_t u32); + +static uint32_t remR3HandlerReadU8(void *pvVM, target_phys_addr_t GCPhys); +static uint32_t remR3HandlerReadU16(void *pvVM, target_phys_addr_t GCPhys); +static uint32_t remR3HandlerReadU32(void *pvVM, target_phys_addr_t GCPhys); +static void remR3HandlerWriteU8(void *pvVM, target_phys_addr_t GCPhys, uint32_t u32); +static void remR3HandlerWriteU16(void *pvVM, target_phys_addr_t GCPhys, uint32_t u32); +static void remR3HandlerWriteU32(void *pvVM, target_phys_addr_t GCPhys, uint32_t u32); + +static void remR3NotifyHandlerPhysicalDeregister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, bool fHasHCHandler, bool fRestoreAsRAM); +static void remR3NotifyHandlerPhysicalRegister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, bool fHasHCHandler); +static void remR3NotifyHandlerPhysicalModify(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhysOld, RTGCPHYS GCPhysNew, RTGCPHYS cb, bool fHasHCHandler, 
bool fRestoreAsRAM); + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ + +/** @todo Move stats to REM::s some rainy day we have nothing do to. */ +#ifdef VBOX_WITH_STATISTICS +static STAMPROFILEADV gStatExecuteSingleInstr; +static STAMPROFILEADV gStatCompilationQEmu; +static STAMPROFILEADV gStatRunCodeQEmu; +static STAMPROFILEADV gStatTotalTimeQEmu; +static STAMPROFILEADV gStatTimers; +static STAMPROFILEADV gStatTBLookup; +static STAMPROFILEADV gStatIRQ; +static STAMPROFILEADV gStatRawCheck; +static STAMPROFILEADV gStatMemRead; +static STAMPROFILEADV gStatMemWrite; +static STAMPROFILE gStatGCPhys2HCVirt; +static STAMCOUNTER gStatCpuGetTSC; +static STAMCOUNTER gStatRefuseTFInhibit; +static STAMCOUNTER gStatRefuseVM86; +static STAMCOUNTER gStatRefusePaging; +static STAMCOUNTER gStatRefusePAE; +static STAMCOUNTER gStatRefuseIOPLNot0; +static STAMCOUNTER gStatRefuseIF0; +static STAMCOUNTER gStatRefuseCode16; +static STAMCOUNTER gStatRefuseWP0; +static STAMCOUNTER gStatRefuseRing1or2; +static STAMCOUNTER gStatRefuseCanExecute; +static STAMCOUNTER gaStatRefuseStale[6]; +static STAMCOUNTER gStatREMGDTChange; +static STAMCOUNTER gStatREMIDTChange; +static STAMCOUNTER gStatREMLDTRChange; +static STAMCOUNTER gStatREMTRChange; +static STAMCOUNTER gStatSelOutOfSync[6]; +static STAMCOUNTER gStatSelOutOfSyncStateBack[6]; +static STAMCOUNTER gStatFlushTBs; +#endif +/* in exec.c */ +extern uint32_t tlb_flush_count; +extern uint32_t tb_flush_count; +extern uint32_t tb_phys_invalidate_count; + +/* + * Global stuff. + */ + +/** MMIO read callbacks. */ +CPUReadMemoryFunc *g_apfnMMIORead[3] = +{ + remR3MMIOReadU8, + remR3MMIOReadU16, + remR3MMIOReadU32 +}; + +/** MMIO write callbacks. */ +CPUWriteMemoryFunc *g_apfnMMIOWrite[3] = +{ + remR3MMIOWriteU8, + remR3MMIOWriteU16, + remR3MMIOWriteU32 +}; + +/** Handler read callbacks. */ +CPUReadMemoryFunc *g_apfnHandlerRead[3] = +{ + remR3HandlerReadU8, + remR3HandlerReadU16, + remR3HandlerReadU32 +}; + +/** Handler write callbacks. */ +CPUWriteMemoryFunc *g_apfnHandlerWrite[3] = +{ + remR3HandlerWriteU8, + remR3HandlerWriteU16, + remR3HandlerWriteU32 +}; + + +#ifdef VBOX_WITH_DEBUGGER +/* + * Debugger commands. + */ +static FNDBGCCMD remR3CmdDisasEnableStepping;; + +/** '.remstep' arguments. */ +static const DBGCVARDESC g_aArgRemStep[] = +{ + /* cTimesMin, cTimesMax, enmCategory, fFlags, pszName, pszDescription */ + { 0, ~0U, DBGCVAR_CAT_NUMBER, 0, "on/off", "Boolean value/mnemonic indicating the new state." }, +}; + +/** Command descriptors. */ +static const DBGCCMD g_aCmds[] = +{ + { + .pszCmd ="remstep", + .cArgsMin = 0, + .cArgsMax = 1, + .paArgDescs = &g_aArgRemStep[0], + .cArgDescs = RT_ELEMENTS(g_aArgRemStep), + .fFlags = 0, + .pfnHandler = remR3CmdDisasEnableStepping, + .pszSyntax = "[on/off]", + .pszDescription = "Enable or disable the single stepping with logged disassembly. " + "If no arguments show the current state." + } +}; +#endif + +/** Prologue code, must be in lower 4G to simplify jumps to/from generated code. + * @todo huh??? That cannot be the case on the mac... So, this + * point is probably not valid any longer. 
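+ *       (The buffer itself is a 1K RTMemExecAlloc allocation done in
+ *       REMR3Init below.)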
*/ +uint8_t *code_gen_prologue; + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +void remAbort(int rc, const char *pszTip); +extern int testmath(void); + +/* Put them here to avoid unused variable warning. */ +AssertCompile(RT_SIZEOFMEMB(VM, rem.padding) >= RT_SIZEOFMEMB(VM, rem.s)); +#if !defined(IPRT_NO_CRT) && (defined(RT_OS_LINUX) || defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS)) +//AssertCompileMemberSize(REM, Env, REM_ENV_SIZE); +/* Why did this have to be identical?? */ +AssertCompile(RT_SIZEOFMEMB(REM, Env) <= REM_ENV_SIZE); +#else +AssertCompile(RT_SIZEOFMEMB(REM, Env) <= REM_ENV_SIZE); +#endif + + +/** + * Initializes the REM. + * + * @returns VBox status code. + * @param pVM The VM to operate on. + */ +REMR3DECL(int) REMR3Init(PVM pVM) +{ + PREMHANDLERNOTIFICATION pCur; + uint32_t u32Dummy; + int rc; + unsigned i; + +#ifdef VBOX_ENABLE_VBOXREM64 + LogRel(("Using 64-bit aware REM\n")); +#endif + + /* + * Assert sanity. + */ + AssertReleaseMsg(sizeof(pVM->rem.padding) >= sizeof(pVM->rem.s), ("%#x >= %#x; sizeof(Env)=%#x\n", sizeof(pVM->rem.padding), sizeof(pVM->rem.s), sizeof(pVM->rem.s.Env))); + AssertReleaseMsg(sizeof(pVM->rem.s.Env) <= REM_ENV_SIZE, ("%#x == %#x\n", sizeof(pVM->rem.s.Env), REM_ENV_SIZE)); + AssertReleaseMsg(!(RT_UOFFSETOF(VM, rem) & 31), ("off=%#zx\n", RT_UOFFSETOF(VM, rem))); +#if 0 /* just an annoyance at the moment. */ +#if defined(DEBUG) && !defined(RT_OS_SOLARIS) && !defined(RT_OS_FREEBSD) /// @todo fix the solaris and freebsd math stuff. + Assert(!testmath()); +#endif +#endif + + /* + * Init some internal data members. + */ + pVM->rem.s.offVM = RT_UOFFSETOF(VM, rem.s); + pVM->rem.s.Env.pVM = pVM; +#ifdef CPU_RAW_MODE_INIT + pVM->rem.s.state |= CPU_RAW_MODE_INIT; +#endif + + /* + * Initialize the REM critical section. + * + * Note: This is not a 100% safe solution as updating the internal memory state while another VCPU + * is executing code could be dangerous. Taking the REM lock is not an option due to the danger of + * deadlocks. (mostly pgm vs rem locking) + */ + rc = PDMR3CritSectInit(pVM, &pVM->rem.s.CritSectRegister, RT_SRC_POS, "REM-Register"); + AssertRCReturn(rc, rc); + + /* ctx. */ + pVM->rem.s.pCtx = NULL; /* set when executing code. */ + AssertMsg(MMR3PhysGetRamSize(pVM) == 0, ("Init order has changed! REM depends on notification about ALL physical memory registrations\n")); + + /* ignore all notifications */ + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + code_gen_prologue = RTMemExecAlloc(_1K); + AssertLogRelReturn(code_gen_prologue, VERR_NO_MEMORY); + + cpu_exec_init_all(0); + + /* + * Init the recompiler. + */ + if (!cpu_x86_init(&pVM->rem.s.Env, "vbox")) + { + AssertMsgFailed(("cpu_x86_init failed - impossible!\n")); + return VERR_GENERAL_FAILURE; + } + PVMCPU pVCpu = VMMGetCpu(pVM); + CPUMGetGuestCpuId(pVCpu, 1, 0, &u32Dummy, &u32Dummy, &pVM->rem.s.Env.cpuid_ext_features, &pVM->rem.s.Env.cpuid_features); + CPUMGetGuestCpuId(pVCpu, 0x80000001, 0, &u32Dummy, &u32Dummy, &pVM->rem.s.Env.cpuid_ext3_features, &pVM->rem.s.Env.cpuid_ext2_features); + + EMRemLock(pVM); + cpu_reset(&pVM->rem.s.Env); + EMRemUnlock(pVM); + + /* allocate code buffer for single instruction emulation. 
*/ + pVM->rem.s.Env.cbCodeBuffer = 4096; + pVM->rem.s.Env.pvCodeBuffer = RTMemExecAlloc(pVM->rem.s.Env.cbCodeBuffer); + AssertMsgReturn(pVM->rem.s.Env.pvCodeBuffer, ("Failed to allocate code buffer!\n"), VERR_NO_MEMORY); + + /* Finally, set the cpu_single_env global. */ + cpu_single_env = &pVM->rem.s.Env; + + /* Nothing is pending by default */ + pVM->rem.s.uStateLoadPendingInterrupt = REM_NO_PENDING_IRQ; + + /* + * Register ram types. + */ + pVM->rem.s.iMMIOMemType = cpu_register_io_memory(g_apfnMMIORead, g_apfnMMIOWrite, &pVM->rem.s.Env); + AssertReleaseMsg(pVM->rem.s.iMMIOMemType >= 0, ("pVM->rem.s.iMMIOMemType=%d\n", pVM->rem.s.iMMIOMemType)); + pVM->rem.s.iHandlerMemType = cpu_register_io_memory(g_apfnHandlerRead, g_apfnHandlerWrite, pVM); + AssertReleaseMsg(pVM->rem.s.iHandlerMemType >= 0, ("pVM->rem.s.iHandlerMemType=%d\n", pVM->rem.s.iHandlerMemType)); + Log2(("REM: iMMIOMemType=%d iHandlerMemType=%d\n", pVM->rem.s.iMMIOMemType, pVM->rem.s.iHandlerMemType)); + + /* stop ignoring. */ + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); + + /* + * Register the saved state data unit. + */ + rc = SSMR3RegisterInternal(pVM, "rem", 1, REM_SAVED_STATE_VERSION, sizeof(uint32_t) * 10, + NULL, NULL, NULL, + NULL, remR3Save, NULL, + NULL, remR3Load, remR3LoadDone); + if (RT_FAILURE(rc)) + return rc; + +#ifdef VBOX_WITH_DEBUGGER + /* + * Debugger commands. + */ + static bool fRegisteredCmds = false; + if (!fRegisteredCmds) + { + int rc = DBGCRegisterCommands(&g_aCmds[0], RT_ELEMENTS(g_aCmds)); + if (RT_SUCCESS(rc)) + fRegisteredCmds = true; + } +#endif + +#ifdef VBOX_WITH_STATISTICS + /* + * Statistics. + */ + STAM_REG(pVM, &gStatExecuteSingleInstr, STAMTYPE_PROFILE, "/PROF/REM/SingleInstr",STAMUNIT_TICKS_PER_CALL, "Profiling single instruction emulation."); + STAM_REG(pVM, &gStatCompilationQEmu, STAMTYPE_PROFILE, "/PROF/REM/Compile", STAMUNIT_TICKS_PER_CALL, "Profiling QEmu compilation."); + STAM_REG(pVM, &gStatRunCodeQEmu, STAMTYPE_PROFILE, "/PROF/REM/Runcode", STAMUNIT_TICKS_PER_CALL, "Profiling QEmu code execution."); + STAM_REG(pVM, &gStatTotalTimeQEmu, STAMTYPE_PROFILE, "/PROF/REM/Emulate", STAMUNIT_TICKS_PER_CALL, "Profiling code emulation."); + STAM_REG(pVM, &gStatTimers, STAMTYPE_PROFILE, "/PROF/REM/Timers", STAMUNIT_TICKS_PER_CALL, "Profiling timer queue processing."); + STAM_REG(pVM, &gStatTBLookup, STAMTYPE_PROFILE, "/PROF/REM/TBLookup", STAMUNIT_TICKS_PER_CALL, "Profiling translation block lookup."); + STAM_REG(pVM, &gStatIRQ, STAMTYPE_PROFILE, "/PROF/REM/IRQ", STAMUNIT_TICKS_PER_CALL, "Profiling IRQ delivery."); + STAM_REG(pVM, &gStatRawCheck, STAMTYPE_PROFILE, "/PROF/REM/RawCheck", STAMUNIT_TICKS_PER_CALL, "Profiling remR3CanExecuteRaw calls."); + STAM_REG(pVM, &gStatMemRead, STAMTYPE_PROFILE, "/PROF/REM/MemRead", STAMUNIT_TICKS_PER_CALL, "Profiling memory access."); + STAM_REG(pVM, &gStatMemWrite, STAMTYPE_PROFILE, "/PROF/REM/MemWrite", STAMUNIT_TICKS_PER_CALL, "Profiling memory access."); + STAM_REG(pVM, &gStatGCPhys2HCVirt, STAMTYPE_PROFILE, "/PROF/REM/GCPhys2HCVirt", STAMUNIT_TICKS_PER_CALL, "Profiling memory conversion (PGMR3PhysTlbGCPhys2Ptr)."); + + STAM_REG(pVM, &gStatCpuGetTSC, STAMTYPE_COUNTER, "/REM/CpuGetTSC", STAMUNIT_OCCURENCES, "cpu_get_tsc calls"); + + STAM_REG(pVM, &gStatRefuseTFInhibit, STAMTYPE_COUNTER, "/REM/Refuse/TFInibit", STAMUNIT_OCCURENCES, "Raw mode refused because of TF or irq inhibit"); + STAM_REG(pVM, &gStatRefuseVM86, STAMTYPE_COUNTER, "/REM/Refuse/VM86", STAMUNIT_OCCURENCES, "Raw mode refused because of VM86"); + STAM_REG(pVM, &gStatRefusePaging, 
STAMTYPE_COUNTER, "/REM/Refuse/Paging", STAMUNIT_OCCURENCES, "Raw mode refused because of disabled paging/pm"); + STAM_REG(pVM, &gStatRefusePAE, STAMTYPE_COUNTER, "/REM/Refuse/PAE", STAMUNIT_OCCURENCES, "Raw mode refused because of PAE"); + STAM_REG(pVM, &gStatRefuseIOPLNot0, STAMTYPE_COUNTER, "/REM/Refuse/IOPLNot0", STAMUNIT_OCCURENCES, "Raw mode refused because of IOPL != 0"); + STAM_REG(pVM, &gStatRefuseIF0, STAMTYPE_COUNTER, "/REM/Refuse/IF0", STAMUNIT_OCCURENCES, "Raw mode refused because of IF=0"); + STAM_REG(pVM, &gStatRefuseCode16, STAMTYPE_COUNTER, "/REM/Refuse/Code16", STAMUNIT_OCCURENCES, "Raw mode refused because of 16 bit code"); + STAM_REG(pVM, &gStatRefuseWP0, STAMTYPE_COUNTER, "/REM/Refuse/WP0", STAMUNIT_OCCURENCES, "Raw mode refused because of WP=0"); + STAM_REG(pVM, &gStatRefuseRing1or2, STAMTYPE_COUNTER, "/REM/Refuse/Ring1or2", STAMUNIT_OCCURENCES, "Raw mode refused because of ring 1/2 execution"); + STAM_REG(pVM, &gStatRefuseCanExecute, STAMTYPE_COUNTER, "/REM/Refuse/CanExecuteRaw", STAMUNIT_OCCURENCES, "Raw mode refused because of cCanExecuteRaw"); + STAM_REG(pVM, &gaStatRefuseStale[R_ES], STAMTYPE_COUNTER, "/REM/Refuse/StaleES", STAMUNIT_OCCURENCES, "Raw mode refused because of stale ES"); + STAM_REG(pVM, &gaStatRefuseStale[R_CS], STAMTYPE_COUNTER, "/REM/Refuse/StaleCS", STAMUNIT_OCCURENCES, "Raw mode refused because of stale CS"); + STAM_REG(pVM, &gaStatRefuseStale[R_SS], STAMTYPE_COUNTER, "/REM/Refuse/StaleSS", STAMUNIT_OCCURENCES, "Raw mode refused because of stale SS"); + STAM_REG(pVM, &gaStatRefuseStale[R_DS], STAMTYPE_COUNTER, "/REM/Refuse/StaleDS", STAMUNIT_OCCURENCES, "Raw mode refused because of stale DS"); + STAM_REG(pVM, &gaStatRefuseStale[R_FS], STAMTYPE_COUNTER, "/REM/Refuse/StaleFS", STAMUNIT_OCCURENCES, "Raw mode refused because of stale FS"); + STAM_REG(pVM, &gaStatRefuseStale[R_GS], STAMTYPE_COUNTER, "/REM/Refuse/StaleGS", STAMUNIT_OCCURENCES, "Raw mode refused because of stale GS"); + STAM_REG(pVM, &gStatFlushTBs, STAMTYPE_COUNTER, "/REM/FlushTB", STAMUNIT_OCCURENCES, "Number of TB flushes"); + + STAM_REG(pVM, &gStatREMGDTChange, STAMTYPE_COUNTER, "/REM/Change/GDTBase", STAMUNIT_OCCURENCES, "GDT base changes"); + STAM_REG(pVM, &gStatREMLDTRChange, STAMTYPE_COUNTER, "/REM/Change/LDTR", STAMUNIT_OCCURENCES, "LDTR changes"); + STAM_REG(pVM, &gStatREMIDTChange, STAMTYPE_COUNTER, "/REM/Change/IDTBase", STAMUNIT_OCCURENCES, "IDT base changes"); + STAM_REG(pVM, &gStatREMTRChange, STAMTYPE_COUNTER, "/REM/Change/TR", STAMUNIT_OCCURENCES, "TR selector changes"); + + STAM_REG(pVM, &gStatSelOutOfSync[0], STAMTYPE_COUNTER, "/REM/State/SelOutOfSync/ES", STAMUNIT_OCCURENCES, "ES out of sync"); + STAM_REG(pVM, &gStatSelOutOfSync[1], STAMTYPE_COUNTER, "/REM/State/SelOutOfSync/CS", STAMUNIT_OCCURENCES, "CS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSync[2], STAMTYPE_COUNTER, "/REM/State/SelOutOfSync/SS", STAMUNIT_OCCURENCES, "SS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSync[3], STAMTYPE_COUNTER, "/REM/State/SelOutOfSync/DS", STAMUNIT_OCCURENCES, "DS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSync[4], STAMTYPE_COUNTER, "/REM/State/SelOutOfSync/FS", STAMUNIT_OCCURENCES, "FS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSync[5], STAMTYPE_COUNTER, "/REM/State/SelOutOfSync/GS", STAMUNIT_OCCURENCES, "GS out of sync"); + + STAM_REG(pVM, &gStatSelOutOfSyncStateBack[0], STAMTYPE_COUNTER, "/REM/StateBack/SelOutOfSync/ES", STAMUNIT_OCCURENCES, "ES out of sync"); + STAM_REG(pVM, &gStatSelOutOfSyncStateBack[1], STAMTYPE_COUNTER, "/REM/StateBack/SelOutOfSync/CS", 
STAMUNIT_OCCURENCES, "CS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSyncStateBack[2], STAMTYPE_COUNTER, "/REM/StateBack/SelOutOfSync/SS", STAMUNIT_OCCURENCES, "SS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSyncStateBack[3], STAMTYPE_COUNTER, "/REM/StateBack/SelOutOfSync/DS", STAMUNIT_OCCURENCES, "DS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSyncStateBack[4], STAMTYPE_COUNTER, "/REM/StateBack/SelOutOfSync/FS", STAMUNIT_OCCURENCES, "FS out of sync"); + STAM_REG(pVM, &gStatSelOutOfSyncStateBack[5], STAMTYPE_COUNTER, "/REM/StateBack/SelOutOfSync/GS", STAMUNIT_OCCURENCES, "GS out of sync"); + + STAM_REG(pVM, &pVM->rem.s.Env.StatTbFlush, STAMTYPE_PROFILE, "/REM/TbFlush", STAMUNIT_TICKS_PER_CALL, "profiling tb_flush()."); +#endif /* VBOX_WITH_STATISTICS */ + AssertCompileMemberAlignment(CPUX86State, StatTbFlush, 4); + AssertCompileMemberAlignment(CPUX86State, StatTbFlush, 8); + + STAM_REL_REG(pVM, &tb_flush_count, STAMTYPE_U32_RESET, "/REM/TbFlushCount", STAMUNIT_OCCURENCES, "tb_flush() calls"); + STAM_REL_REG(pVM, &tb_phys_invalidate_count, STAMTYPE_U32_RESET, "/REM/TbPhysInvldCount", STAMUNIT_OCCURENCES, "tb_phys_invalidate() calls"); + STAM_REL_REG(pVM, &tlb_flush_count, STAMTYPE_U32_RESET, "/REM/TlbFlushCount", STAMUNIT_OCCURENCES, "tlb_flush() calls"); + + +#ifdef DEBUG_ALL_LOGGING + loglevel = ~0; +#endif + + /* + * Init the handler notification lists. + */ + pVM->rem.s.idxPendingList = UINT32_MAX; + pVM->rem.s.idxFreeList = 0; + + for (i = 0 ; i < RT_ELEMENTS(pVM->rem.s.aHandlerNotifications); i++) + { + pCur = &pVM->rem.s.aHandlerNotifications[i]; + pCur->idxNext = i + 1; + pCur->idxSelf = i; + } + pCur->idxNext = UINT32_MAX; /* the last record. */ + + return rc; +} + + +/** + * Finalizes the REM initialization. + * + * This is called after all components, devices and drivers has + * been initialized. Its main purpose it to finish the RAM related + * initialization. + * + * @returns VBox status code. + * + * @param pVM The VM handle. + */ +REMR3DECL(int) REMR3InitFinalize(PVM pVM) +{ + int rc; + + /* + * Ram size & dirty bit map. + */ + Assert(!pVM->rem.s.fGCPhysLastRamFixed); + pVM->rem.s.fGCPhysLastRamFixed = true; +#ifdef RT_STRICT + rc = remR3InitPhysRamSizeAndDirtyMap(pVM, true /* fGuarded */); +#else + rc = remR3InitPhysRamSizeAndDirtyMap(pVM, false /* fGuarded */); +#endif + return rc; +} + +/** + * Initializes ram_list.phys_dirty and ram_list.phys_dirty_size. + * + * @returns VBox status code. + * @param pVM The VM handle. + * @param fGuarded Whether to guard the map. + */ +static int remR3InitPhysRamSizeAndDirtyMap(PVM pVM, bool fGuarded) +{ + int rc = VINF_SUCCESS; + RTGCPHYS cb; + + AssertLogRelReturn(QLIST_EMPTY(&ram_list.blocks), VERR_INTERNAL_ERROR_2); + + cb = pVM->rem.s.GCPhysLastRam + 1; + AssertLogRelMsgReturn(cb > pVM->rem.s.GCPhysLastRam, + ("GCPhysLastRam=%RGp - out of range\n", pVM->rem.s.GCPhysLastRam), + VERR_OUT_OF_RANGE); + + ram_list.phys_dirty_size = cb >> PAGE_SHIFT; + AssertMsg(((RTGCPHYS)ram_list.phys_dirty_size << PAGE_SHIFT) == cb, ("%RGp\n", cb)); + + if (!fGuarded) + { + ram_list.phys_dirty = MMR3HeapAlloc(pVM, MM_TAG_REM, ram_list.phys_dirty_size); + AssertLogRelMsgReturn(ram_list.phys_dirty, ("Failed to allocate %u bytes of dirty page map bytes\n", ram_list.phys_dirty_size), VERR_NO_MEMORY); + } + else + { + /* + * Fill it up the nearest 4GB RAM and leave at least _64KB of guard after it. 
+ */ + uint32_t cbBitmapAligned = RT_ALIGN_32(ram_list.phys_dirty_size, PAGE_SIZE); + uint32_t cbBitmapFull = RT_ALIGN_32(ram_list.phys_dirty_size, (_4G >> PAGE_SHIFT)); + if (cbBitmapFull == cbBitmapAligned) + cbBitmapFull += _4G >> PAGE_SHIFT; + else if (cbBitmapFull - cbBitmapAligned < _64K) + cbBitmapFull += _64K; + + ram_list.phys_dirty = RTMemPageAlloc(cbBitmapFull); + AssertLogRelMsgReturn(ram_list.phys_dirty, ("Failed to allocate %u bytes of dirty page map bytes\n", cbBitmapFull), VERR_NO_MEMORY); + + rc = RTMemProtect(ram_list.phys_dirty + cbBitmapAligned, cbBitmapFull - cbBitmapAligned, RTMEM_PROT_NONE); + if (RT_FAILURE(rc)) + { + RTMemPageFree(ram_list.phys_dirty, cbBitmapFull); + AssertLogRelRCReturn(rc, rc); + } + + ram_list.phys_dirty += cbBitmapAligned - ram_list.phys_dirty_size; + } + + /* initialize it. */ + memset(ram_list.phys_dirty, 0xff, ram_list.phys_dirty_size); + return rc; +} + + +/** + * Terminates the REM. + * + * Termination means cleaning up and freeing all resources, + * the VM it self is at this point powered off or suspended. + * + * @returns VBox status code. + * @param pVM The VM to operate on. + */ +REMR3DECL(int) REMR3Term(PVM pVM) +{ + /* + * Statistics. + */ + STAMR3Deregister(pVM->pUVM, "/PROF/REM/*"); + STAMR3Deregister(pVM->pUVM, "/REM/*"); + + return VINF_SUCCESS; +} + + +/** + * The VM is being reset. + * + * For the REM component this means to call the cpu_reset() and + * reinitialize some state variables. + * + * @param pVM VM handle. + */ +REMR3DECL(void) REMR3Reset(PVM pVM) +{ + EMRemLock(pVM); /* Only pro forma, we're in a rendezvous. */ + + /* + * Reset the REM cpu. + */ + Assert(pVM->rem.s.cIgnoreAll == 0); + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + cpu_reset(&pVM->rem.s.Env); + pVM->rem.s.cInvalidatedPages = 0; + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); + Assert(pVM->rem.s.cIgnoreAll == 0); + + /* Clear raw ring 0 init state */ + pVM->rem.s.Env.state &= ~CPU_RAW_RING0; + + /* Flush the TBs the next time we execute code here. */ + pVM->rem.s.fFlushTBs = true; + + EMRemUnlock(pVM); +} + + +/** + * Execute state save operation. + * + * @returns VBox status code. + * @param pVM VM Handle. + * @param pSSM SSM operation handle. + */ +static DECLCALLBACK(int) remR3Save(PVM pVM, PSSMHANDLE pSSM) +{ + PREM pRem = &pVM->rem.s; + + /* + * Save the required CPU Env bits. + * (Not much because we're never in REM when doing the save.) + */ + LogFlow(("remR3Save:\n")); + Assert(!pRem->fInREM); + SSMR3PutU32(pSSM, pRem->Env.hflags); + SSMR3PutU32(pSSM, ~0); /* separator */ + + /* Remember if we've entered raw mode (vital for ring 1 checks in e.g. iret emulation). */ + SSMR3PutU32(pSSM, !!(pRem->Env.state & CPU_RAW_RING0)); + SSMR3PutU32(pSSM, REM_NO_PENDING_IRQ); + + return SSMR3PutU32(pSSM, ~0); /* terminator */ +} + + +/** + * Execute state load operation. + * + * @returns VBox status code. + * @param pVM VM Handle. + * @param pSSM SSM operation handle. + * @param uVersion Data layout version. + * @param uPass The data pass. + */ +static DECLCALLBACK(int) remR3Load(PVM pVM, PSSMHANDLE pSSM, uint32_t uVersion, uint32_t uPass) +{ + uint32_t u32Dummy; + uint32_t fRawRing0 = false; + uint32_t u32Sep; + uint32_t i; + int rc; + PREM pRem; + + LogFlow(("remR3Load:\n")); + Assert(uPass == SSM_PASS_FINAL); NOREF(uPass); + + /* + * Validate version. 
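 * (Layout note: for the current saved-state version the unit written by
 *  remR3Save() above is just five 32-bit values: Env.hflags, a ~0 separator,
 *  a raw-ring-0 indicator, the pending-interrupt slot (always
 *  REM_NO_PENDING_IRQ on save) and a ~0 terminator; the extra fields read
 *  further down only exist in the old 1.6 layout.)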
+ */ + if ( uVersion != REM_SAVED_STATE_VERSION + && uVersion != REM_SAVED_STATE_VERSION_VER1_6) + { + AssertMsgFailed(("remR3Load: Invalid version uVersion=%d!\n", uVersion)); + return VERR_SSM_UNSUPPORTED_DATA_UNIT_VERSION; + } + + /* + * Do a reset to be on the safe side... + */ + REMR3Reset(pVM); + + /* + * Ignore all ignorable notifications. + * (Not doing this will cause serious trouble.) + */ + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + /* + * Load the required CPU Env bits. + * (Not much because we're never in REM when doing the save.) + */ + pRem = &pVM->rem.s; + Assert(!pRem->fInREM); + SSMR3GetU32(pSSM, &pRem->Env.hflags); + if (uVersion == REM_SAVED_STATE_VERSION_VER1_6) + { + /* Redundant REM CPU state has to be loaded, but can be ignored. */ + CPUX86State_Ver16 temp; + SSMR3GetMem(pSSM, &temp, RT_UOFFSETOF(CPUX86State_Ver16, jmp_env)); + } + + rc = SSMR3GetU32(pSSM, &u32Sep); /* separator */ + if (RT_FAILURE(rc)) + return rc; + if (u32Sep != ~0U) + { + AssertMsgFailed(("u32Sep=%#x\n", u32Sep)); + return VERR_SSM_DATA_UNIT_FORMAT_CHANGED; + } + + /* Remember if we've entered raw mode (vital for ring 1 checks in e.g. iret emulation). */ + SSMR3GetUInt(pSSM, &fRawRing0); + if (fRawRing0) + pRem->Env.state |= CPU_RAW_RING0; + + if (uVersion == REM_SAVED_STATE_VERSION_VER1_6) + { + /* + * Load the REM stuff. + */ + /** @todo r=bird: We should just drop all these items, restoring doesn't make + * sense. */ + rc = SSMR3GetU32(pSSM, (uint32_t *)&pRem->cInvalidatedPages); + if (RT_FAILURE(rc)) + return rc; + if (pRem->cInvalidatedPages > RT_ELEMENTS(pRem->aGCPtrInvalidatedPages)) + { + AssertMsgFailed(("cInvalidatedPages=%#x\n", pRem->cInvalidatedPages)); + return VERR_SSM_DATA_UNIT_FORMAT_CHANGED; + } + for (i = 0; i < pRem->cInvalidatedPages; i++) + SSMR3GetGCPtr(pSSM, &pRem->aGCPtrInvalidatedPages[i]); + } + + rc = SSMR3GetUInt(pSSM, &pVM->rem.s.uStateLoadPendingInterrupt); + AssertRCReturn(rc, rc); + AssertLogRelMsgReturn( pVM->rem.s.uStateLoadPendingInterrupt == REM_NO_PENDING_IRQ + || pVM->rem.s.uStateLoadPendingInterrupt < 256, + ("uStateLoadPendingInterrupt=%#x\n", pVM->rem.s.uStateLoadPendingInterrupt), + VERR_SSM_UNEXPECTED_DATA); + + /* check the terminator. */ + rc = SSMR3GetU32(pSSM, &u32Sep); + if (RT_FAILURE(rc)) + return rc; + if (u32Sep != ~0U) + { + AssertMsgFailed(("u32Sep=%#x (term)\n", u32Sep)); + return VERR_SSM_DATA_UNIT_FORMAT_CHANGED; + } + + /* + * Get the CPUID features. + */ + PVMCPU pVCpu = VMMGetCpu(pVM); + CPUMGetGuestCpuId(pVCpu, 1, 0, &u32Dummy, &u32Dummy, &pVM->rem.s.Env.cpuid_ext_features, &pVM->rem.s.Env.cpuid_features); + CPUMGetGuestCpuId(pVCpu, 0x80000001, 0, &u32Dummy, &u32Dummy, &u32Dummy, &pVM->rem.s.Env.cpuid_ext2_features); + + /* + * Stop ignoring ignorable notifications. + */ + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); + + /* + * Sync the whole CPU state when executing code in the recompiler. + */ + for (i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_ALL); + } + return VINF_SUCCESS; +} + + +/** + * @callback_method_impl{FNSSMINTLOADDONE, + * For pushing misdesigned pending-interrupt mess to TRPM where it belongs. 
} + */ +static DECLCALLBACK(int) remR3LoadDone(PVM pVM, PSSMHANDLE pSSM) +{ + if (pVM->rem.s.uStateLoadPendingInterrupt != REM_NO_PENDING_IRQ) + { + int rc = TRPMAssertTrap(&pVM->aCpus[0], pVM->rem.s.uStateLoadPendingInterrupt, TRPM_HARDWARE_INT); + AssertLogRelMsgReturn(rc, ("uStateLoadPendingInterrupt=%#x rc=%Rrc\n", pVM->rem.s.uStateLoadPendingInterrupt, rc), rc); + pVM->rem.s.uStateLoadPendingInterrupt = REM_NO_PENDING_IRQ; + } + return VINF_SUCCESS; +} + + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM_RUN + +/** + * Single steps an instruction in recompiled mode. + * + * Before calling this function the REM state needs to be in sync with + * the VM. Call REMR3State() to perform the sync. It's only necessary + * (and permitted) to sync at the first call to REMR3Step()/REMR3Run() + * and after calling REMR3StateBack(). + * + * @returns VBox status code. + * + * @param pVM VM Handle. + * @param pVCpu VMCPU Handle. + */ +REMR3DECL(int) REMR3Step(PVM pVM, PVMCPU pVCpu) +{ + int rc, interrupt_request; + RTGCPTR GCPtrPC; + bool fBp; + + /* + * Lock the REM - we don't wanna have anyone interrupting us + * while stepping - and enabled single stepping. We also ignore + * pending interrupts and suchlike. + */ + interrupt_request = pVM->rem.s.Env.interrupt_request; + Assert(!(interrupt_request & ~(CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXITTB | CPU_INTERRUPT_TIMER | CPU_INTERRUPT_EXTERNAL_HARD | CPU_INTERRUPT_EXTERNAL_EXIT | CPU_INTERRUPT_EXTERNAL_FLUSH_TLB | CPU_INTERRUPT_EXTERNAL_TIMER))); + pVM->rem.s.Env.interrupt_request = 0; + cpu_single_step(&pVM->rem.s.Env, 1); + + /* + * If we're standing at a breakpoint, that have to be disabled before we start stepping. + */ + GCPtrPC = pVM->rem.s.Env.eip + pVM->rem.s.Env.segs[R_CS].base; + fBp = !cpu_breakpoint_remove(&pVM->rem.s.Env, GCPtrPC, BP_GDB); + + /* + * Execute and handle the return code. + * We execute without enabling the cpu tick, so on success we'll + * just flip it on and off to make sure it moves + */ + rc = cpu_exec(&pVM->rem.s.Env); + if (rc == EXCP_DEBUG) + { + TMR3NotifyResume(pVM, pVCpu); + TMR3NotifySuspend(pVM, pVCpu); + rc = VINF_EM_DBG_STEPPED; + } + else + { + switch (rc) + { + case EXCP_INTERRUPT: rc = VINF_SUCCESS; break; + case EXCP_HLT: + case EXCP_HALTED: rc = VINF_EM_HALT; break; + case EXCP_RC: + rc = pVM->rem.s.rc; + pVM->rem.s.rc = VERR_INTERNAL_ERROR; + break; + case EXCP_EXECUTE_RAW: + case EXCP_EXECUTE_HM: + /** @todo is it correct? No! */ + rc = VINF_SUCCESS; + break; + default: + AssertReleaseMsgFailed(("This really shouldn't happen, rc=%d!\n", rc)); + rc = VERR_INTERNAL_ERROR; + break; + } + } + + /* + * Restore the stuff we changed to prevent interruption. + * Unlock the REM. + */ + if (fBp) + { + int rc2 = cpu_breakpoint_insert(&pVM->rem.s.Env, GCPtrPC, BP_GDB, NULL); + Assert(rc2 == 0); NOREF(rc2); + } + cpu_single_step(&pVM->rem.s.Env, 0); + pVM->rem.s.Env.interrupt_request = interrupt_request; + + return rc; +} + + +/** + * Set a breakpoint using the REM facilities. + * + * @returns VBox status code. + * @param pVM The VM handle. + * @param Address The breakpoint address. + * @thread The emulation thread. 
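 *
 * Usage sketch for the single-stepping and breakpoint API above and below;
 * this is an illustration only, assuming a valid pVM/pVCpu on the emulation
 * thread and a hypothetical guest address GCPtrBp, not the actual EM/DBGF
 * code:
 *
 * @code
 *      int rc = REMR3State(pVM, pVCpu);            // sync REM with the VM first
 *      if (RT_SUCCESS(rc))
 *      {
 *          rc = REMR3Step(pVM, pVCpu);             // VINF_EM_DBG_STEPPED on a clean step
 *          int rc2 = REMR3StateBack(pVM, pVCpu);   // write the REM state back to the VM
 *          AssertRC(rc2);
 *      }
 *
 *      rc = REMR3BreakpointSet(pVM, GCPtrBp);      // VERR_REM_NO_MORE_BP_SLOTS when full
 *      // ... execute / step as needed ...
 *      rc = REMR3BreakpointClear(pVM, GCPtrBp);    // VERR_REM_BP_NOT_FOUND if not set
 * @endcode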
+ */ +REMR3DECL(int) REMR3BreakpointSet(PVM pVM, RTGCUINTPTR Address) +{ + VM_ASSERT_EMT(pVM); + if (!cpu_breakpoint_insert(&pVM->rem.s.Env, Address, BP_GDB, NULL)) + { + LogFlow(("REMR3BreakpointSet: Address=%RGv\n", Address)); + return VINF_SUCCESS; + } + LogFlow(("REMR3BreakpointSet: Address=%RGv - failed!\n", Address)); + return VERR_REM_NO_MORE_BP_SLOTS; +} + + +/** + * Clears a breakpoint set by REMR3BreakpointSet(). + * + * @returns VBox status code. + * @param pVM The VM handle. + * @param Address The breakpoint address. + * @thread The emulation thread. + */ +REMR3DECL(int) REMR3BreakpointClear(PVM pVM, RTGCUINTPTR Address) +{ + VM_ASSERT_EMT(pVM); + if (!cpu_breakpoint_remove(&pVM->rem.s.Env, Address, BP_GDB)) + { + LogFlow(("REMR3BreakpointClear: Address=%RGv\n", Address)); + return VINF_SUCCESS; + } + LogFlow(("REMR3BreakpointClear: Address=%RGv - not found!\n", Address)); + return VERR_REM_BP_NOT_FOUND; +} + + +/** + * Emulate an instruction. + * + * This function executes one instruction without letting anyone + * interrupt it. This is intended for being called while being in + * raw mode and thus will take care of all the state syncing between + * REM and the rest. + * + * @returns VBox status code. + * @param pVM VM handle. + * @param pVCpu VMCPU Handle. + */ +REMR3DECL(int) REMR3EmulateInstruction(PVM pVM, PVMCPU pVCpu) +{ + bool fFlushTBs; + + int rc, rc2; + Log2(("REMR3EmulateInstruction: (cs:eip=%04x:%08x)\n", CPUMGetGuestCS(pVCpu), CPUMGetGuestEIP(pVCpu))); + + /* Make sure this flag is set; we might never execute remR3CanExecuteRaw in the AMD-V case. + * CPU_RAW_HM makes sure we never execute interrupt handlers in the recompiler. + */ + if (!VM_IS_RAW_MODE_ENABLED(pVM)) + pVM->rem.s.Env.state |= CPU_RAW_HM; + + /* Skip the TB flush as that's rather expensive and not necessary for single instruction emulation. */ + fFlushTBs = pVM->rem.s.fFlushTBs; + pVM->rem.s.fFlushTBs = false; + + /* + * Sync the state and enable single instruction / single stepping. + */ + rc = REMR3State(pVM, pVCpu); + pVM->rem.s.fFlushTBs = fFlushTBs; + if (RT_SUCCESS(rc)) + { + int interrupt_request = pVM->rem.s.Env.interrupt_request; + Assert(!( interrupt_request + & ~(CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXITTB | CPU_INTERRUPT_TIMER | CPU_INTERRUPT_EXTERNAL_HARD + | CPU_INTERRUPT_EXTERNAL_EXIT | CPU_INTERRUPT_EXTERNAL_FLUSH_TLB | CPU_INTERRUPT_EXTERNAL_TIMER + | CPU_INTERRUPT_EXTERNAL_DMA))); +#ifdef REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING + cpu_single_step(&pVM->rem.s.Env, 0); +#endif + Assert(!pVM->rem.s.Env.singlestep_enabled); + + /* + * Now we set the execute single instruction flag and enter the cpu_exec loop. + */ + TMNotifyStartOfExecution(pVCpu); + pVM->rem.s.Env.interrupt_request = CPU_INTERRUPT_SINGLE_INSTR; + rc = cpu_exec(&pVM->rem.s.Env); + TMNotifyEndOfExecution(pVCpu); + switch (rc) + { + /* + * Executed without anything out of the way happening. + */ + case EXCP_SINGLE_INSTR: + rc = VINF_EM_RESCHEDULE; + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_SINGLE_INSTR\n")); + break; + + /* + * If we take a trap or start servicing a pending interrupt, we might end up here. + * (Timer thread or some other thread wishing EMT's attention.) + */ + case EXCP_INTERRUPT: + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_INTERRUPT\n")); + rc = VINF_EM_RESCHEDULE; + break; + + /* + * Single step, we assume! + * If there was a breakpoint there we're fucked now. 
+ */ + case EXCP_DEBUG: + if (pVM->rem.s.Env.watchpoint_hit) + { + /** @todo deal with watchpoints */ + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_DEBUG rc=%Rrc !watchpoint_hit!\n", rc)); + rc = VINF_EM_DBG_BREAKPOINT; + } + else + { + CPUBreakpoint *pBP; + RTGCPTR GCPtrPC = pVM->rem.s.Env.eip + pVM->rem.s.Env.segs[R_CS].base; + QTAILQ_FOREACH(pBP, &pVM->rem.s.Env.breakpoints, entry) + if (pBP->pc == GCPtrPC) + break; + rc = pBP ? VINF_EM_DBG_BREAKPOINT : VINF_EM_DBG_STEPPED; + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_DEBUG rc=%Rrc pBP=%p GCPtrPC=%RGv\n", rc, pBP, GCPtrPC)); + } + break; + + /* + * hlt instruction. + */ + case EXCP_HLT: + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_HLT\n")); + rc = VINF_EM_HALT; + break; + + /* + * The VM has halted. + */ + case EXCP_HALTED: + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_HALTED\n")); + rc = VINF_EM_HALT; + break; + + /* + * Switch to RAW-mode. + */ + case EXCP_EXECUTE_RAW: + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_EXECUTE_RAW\n")); + rc = VINF_EM_RESCHEDULE_RAW; + break; + + /* + * Switch to hardware accelerated RAW-mode. + */ + case EXCP_EXECUTE_HM: + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_EXECUTE_HM\n")); + rc = VINF_EM_RESCHEDULE_HM; + break; + + /* + * An EM RC was raised (VMR3Reset/Suspend/PowerOff/some-fatal-error). + */ + case EXCP_RC: + Log2(("REMR3EmulateInstruction: cpu_exec -> EXCP_RC\n")); + rc = pVM->rem.s.rc; + pVM->rem.s.rc = VERR_INTERNAL_ERROR; + break; + + /* + * Figure out the rest when they arrive.... + */ + default: + AssertMsgFailed(("rc=%d\n", rc)); + Log2(("REMR3EmulateInstruction: cpu_exec -> %d\n", rc)); + rc = VINF_EM_RESCHEDULE; + break; + } + + /* + * Switch back the state. + */ + pVM->rem.s.Env.interrupt_request = interrupt_request; + rc2 = REMR3StateBack(pVM, pVCpu); + AssertRC(rc2); + } + + Log2(("REMR3EmulateInstruction: returns %Rrc (cs:eip=%04x:%RGv)\n", + rc, pVM->rem.s.Env.segs[R_CS].selector, (RTGCPTR)pVM->rem.s.Env.eip)); + return rc; +} + + +/** + * Used by REMR3Run to handle the case where CPU_EMULATE_SINGLE_STEP is set. + * + * @returns VBox status code. + * + * @param pVM The VM handle. + * @param pVCpu The Virtual CPU handle. + */ +static int remR3RunLoggingStep(PVM pVM, PVMCPU pVCpu) +{ + int rc; + + Assert(pVM->rem.s.fInREM); +#ifdef REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING + cpu_single_step(&pVM->rem.s.Env, 1); +#else + Assert(!pVM->rem.s.Env.singlestep_enabled); +#endif + + /* + * Now we set the execute single instruction flag and enter the cpu_exec loop. + */ + for (;;) + { + char szBuf[256]; + + /* + * Log the current registers state and instruction. + */ + remR3StateUpdate(pVM, pVCpu); + DBGFR3Info(pVM->pUVM, "cpumguest", NULL, NULL); + szBuf[0] = '\0'; + rc = DBGFR3DisasInstrEx(pVM->pUVM, + pVCpu->idCpu, + 0, /* Sel */ 0, /* GCPtr */ + DBGF_DISAS_FLAGS_CURRENT_GUEST | DBGF_DISAS_FLAGS_DEFAULT_MODE, + szBuf, + sizeof(szBuf), + NULL); + if (RT_FAILURE(rc)) + RTStrPrintf(szBuf, sizeof(szBuf), "DBGFR3DisasInstrEx failed with rc=%Rrc\n", rc); + RTLogPrintf("CPU%d: %s\n", pVCpu->idCpu, szBuf); + + /* + * Execute the instruction. + */ + TMNotifyStartOfExecution(pVCpu); + + if ( pVM->rem.s.Env.exception_index < 0 + || pVM->rem.s.Env.exception_index > 256) + pVM->rem.s.Env.exception_index = -1; /** @todo We need to do similar stuff elsewhere, I think. 
*/ + +#ifdef REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING + pVM->rem.s.Env.interrupt_request = 0; +#else + pVM->rem.s.Env.interrupt_request = CPU_INTERRUPT_SINGLE_INSTR; +#endif + if (VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_UPDATE_APIC | VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC)) + pVM->rem.s.Env.interrupt_request |= CPU_INTERRUPT_HARD; + RTLogPrintf("remR3RunLoggingStep: interrupt_request=%#x halted=%d exception_index=%#x\n", + pVM->rem.s.Env.interrupt_request, + pVM->rem.s.Env.halted, + pVM->rem.s.Env.exception_index + ); + + rc = cpu_exec(&pVM->rem.s.Env); + + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> %#x interrupt_request=%#x halted=%d exception_index=%#x\n", rc, + pVM->rem.s.Env.interrupt_request, + pVM->rem.s.Env.halted, + pVM->rem.s.Env.exception_index + ); + + TMNotifyEndOfExecution(pVCpu); + + switch (rc) + { +#ifndef REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING + /* + * The normal exit. + */ + case EXCP_SINGLE_INSTR: + if ( !VM_FF_IS_ANY_SET(pVM, VM_FF_ALL_REM_MASK) + && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_ALL_REM_MASK)) + continue; + RTLogPrintf("remR3RunLoggingStep: rc=VINF_SUCCESS w/ FFs (%#x/%#RX64)\n", + pVM->fGlobalForcedActions, (uint64_t)pVCpu->fLocalForcedActions); + rc = VINF_SUCCESS; + break; + +#else + /* + * The normal exit, check for breakpoints at PC just to be sure. + */ +#endif + case EXCP_DEBUG: + if (pVM->rem.s.Env.watchpoint_hit) + { + /** @todo deal with watchpoints */ + Log2(("remR3RunLoggingStep: cpu_exec -> EXCP_DEBUG rc=%Rrc !watchpoint_hit!\n", rc)); + rc = VINF_EM_DBG_BREAKPOINT; + } + else + { + CPUBreakpoint *pBP; + RTGCPTR GCPtrPC = pVM->rem.s.Env.eip + pVM->rem.s.Env.segs[R_CS].base; + QTAILQ_FOREACH(pBP, &pVM->rem.s.Env.breakpoints, entry) + if (pBP->pc == GCPtrPC) + break; + rc = pBP ? VINF_EM_DBG_BREAKPOINT : VINF_EM_DBG_STEPPED; + Log2(("remR3RunLoggingStep: cpu_exec -> EXCP_DEBUG rc=%Rrc pBP=%p GCPtrPC=%RGv\n", rc, pBP, GCPtrPC)); + } +#ifdef REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING + if (rc == VINF_EM_DBG_STEPPED) + { + if ( !VM_FF_IS_ANY_SET(pVM, VM_FF_ALL_REM_MASK) + && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_ALL_REM_MASK)) + continue; + + RTLogPrintf("remR3RunLoggingStep: rc=VINF_SUCCESS w/ FFs (%#x/%#RX64)\n", + pVM->fGlobalForcedActions, (uint64_t)pVCpu->fLocalForcedActions); + rc = VINF_SUCCESS; + } +#endif + break; + + /* + * If we take a trap or start servicing a pending interrupt, we might end up here. + * (Timer thread or some other thread wishing EMT's attention.) + */ + case EXCP_INTERRUPT: + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> EXCP_INTERRUPT rc=VINF_SUCCESS\n"); + rc = VINF_SUCCESS; + break; + + /* + * hlt instruction. + */ + case EXCP_HLT: + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> EXCP_HLT rc=VINF_EM_HALT\n"); + rc = VINF_EM_HALT; + break; + + /* + * The VM has halted. + */ + case EXCP_HALTED: + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> EXCP_HALTED rc=VINF_EM_HALT\n"); + rc = VINF_EM_HALT; + break; + + /* + * Switch to RAW-mode. + */ + case EXCP_EXECUTE_RAW: + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> EXCP_EXECUTE_RAW rc=VINF_EM_RESCHEDULE_RAW\n"); + rc = VINF_EM_RESCHEDULE_RAW; + break; + + /* + * Switch to hardware accelerated RAW-mode. + */ + case EXCP_EXECUTE_HM: + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> EXCP_EXECUTE_HM rc=VINF_EM_RESCHEDULE_HM\n"); + rc = VINF_EM_RESCHEDULE_HM; + break; + + /* + * An EM RC was raised (VMR3Reset/Suspend/PowerOff/some-fatal-error). 
+ */ + case EXCP_RC: + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> EXCP_RC rc=%Rrc\n", pVM->rem.s.rc); + rc = pVM->rem.s.rc; + pVM->rem.s.rc = VERR_INTERNAL_ERROR; + break; + + /* + * Figure out the rest when they arrive.... + */ + default: + AssertMsgFailed(("rc=%d\n", rc)); + RTLogPrintf("remR3RunLoggingStep: cpu_exec -> %d rc=VINF_EM_RESCHEDULE\n", rc); + rc = VINF_EM_RESCHEDULE; + break; + } + break; + } + +#ifdef REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING +// cpu_single_step(&pVM->rem.s.Env, 0); +#else + pVM->rem.s.Env.interrupt_request &= ~(CPU_INTERRUPT_SINGLE_INSTR | CPU_INTERRUPT_SINGLE_INSTR_IN_FLIGHT); +#endif + return rc; +} + + +/** + * Runs code in recompiled mode. + * + * Before calling this function the REM state needs to be in sync with + * the VM. Call REMR3State() to perform the sync. It's only necessary + * (and permitted) to sync at the first call to REMR3Step()/REMR3Run() + * and after calling REMR3StateBack(). + * + * @returns VBox status code. + * + * @param pVM VM Handle. + * @param pVCpu VMCPU Handle. + */ +REMR3DECL(int) REMR3Run(PVM pVM, PVMCPU pVCpu) +{ + int rc; + + if (RT_UNLIKELY(pVM->rem.s.Env.state & CPU_EMULATE_SINGLE_STEP)) + return remR3RunLoggingStep(pVM, pVCpu); + + Assert(pVM->rem.s.fInREM); + Log2(("REMR3Run: (cs:eip=%04x:%RGv)\n", pVM->rem.s.Env.segs[R_CS].selector, (RTGCPTR)pVM->rem.s.Env.eip)); + + TMNotifyStartOfExecution(pVCpu); + rc = cpu_exec(&pVM->rem.s.Env); + TMNotifyEndOfExecution(pVCpu); + switch (rc) + { + /* + * This happens when the execution was interrupted + * by an external event, like pending timers. + */ + case EXCP_INTERRUPT: + Log2(("REMR3Run: cpu_exec -> EXCP_INTERRUPT\n")); + rc = VINF_SUCCESS; + break; + + /* + * hlt instruction. + */ + case EXCP_HLT: + Log2(("REMR3Run: cpu_exec -> EXCP_HLT\n")); + rc = VINF_EM_HALT; + break; + + /* + * The VM has halted. + */ + case EXCP_HALTED: + Log2(("REMR3Run: cpu_exec -> EXCP_HALTED\n")); + rc = VINF_EM_HALT; + break; + + /* + * Breakpoint/single step. + */ + case EXCP_DEBUG: + if (pVM->rem.s.Env.watchpoint_hit) + { + /** @todo deal with watchpoints */ + Log2(("REMR3Run: cpu_exec -> EXCP_DEBUG rc=%Rrc !watchpoint_hit!\n", rc)); + rc = VINF_EM_DBG_BREAKPOINT; + } + else + { + CPUBreakpoint *pBP; + RTGCPTR GCPtrPC = pVM->rem.s.Env.eip + pVM->rem.s.Env.segs[R_CS].base; + QTAILQ_FOREACH(pBP, &pVM->rem.s.Env.breakpoints, entry) + if (pBP->pc == GCPtrPC) + break; + rc = pBP ? VINF_EM_DBG_BREAKPOINT : VINF_EM_DBG_STEPPED; + Log2(("REMR3Run: cpu_exec -> EXCP_DEBUG rc=%Rrc pBP=%p GCPtrPC=%RGv\n", rc, pBP, GCPtrPC)); + } + break; + + /* + * Switch to RAW-mode. + */ + case EXCP_EXECUTE_RAW: + Log2(("REMR3Run: cpu_exec -> EXCP_EXECUTE_RAW pc=%RGv\n", pVM->rem.s.Env.eip)); + rc = VINF_EM_RESCHEDULE_RAW; + break; + + /* + * Switch to hardware accelerated RAW-mode. + */ + case EXCP_EXECUTE_HM: + Log2(("REMR3Run: cpu_exec -> EXCP_EXECUTE_HM\n")); + rc = VINF_EM_RESCHEDULE_HM; + break; + + /* + * An EM RC was raised (VMR3Reset/Suspend/PowerOff/some-fatal-error). + */ + case EXCP_RC: + Log2(("REMR3Run: cpu_exec -> EXCP_RC rc=%Rrc\n", pVM->rem.s.rc)); + rc = pVM->rem.s.rc; + pVM->rem.s.rc = VERR_INTERNAL_ERROR; + break; + + /* + * Figure out the rest when they arrive.... + */ + default: + AssertMsgFailed(("rc=%d\n", rc)); + Log2(("REMR3Run: cpu_exec -> %d\n", rc)); + rc = VINF_SUCCESS; + break; + } + + Log2(("REMR3Run: returns %Rrc (cs:eip=%04x:%RGv)\n", rc, pVM->rem.s.Env.segs[R_CS].selector, (RTGCPTR)pVM->rem.s.Env.eip)); + return rc; +} + + +/** + * Check if the cpu state is suitable for Raw execution. 
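 *
 * For a VM that is not using raw-mode, this builds a partial CPUMCTX from the
 * recompiler state and asks HMCanExecuteGuest() whether the hardware can take
 * over (returning EXCP_EXECUTE_HM). Otherwise the usual raw-mode restrictions
 * are applied: protected mode with paging, no V86 mode, no TF or interrupt
 * inhibition, no single stepping, breakpoints or watchpoints, no stale hidden
 * selectors, and either ring-3 code with IF set or 32-bit ring-0 (or ring-1,
 * when raw ring-1 is enabled) code with CR0.WP set and IOPL 0.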
+ * + * @returns true if RAW/HWACC mode is ok, false if we should stay in REM. + * + * @param env The CPU env struct. + * @param eip The EIP to check this for (might differ from env->eip). + * @param fFlags hflags OR'ed with IOPL, TF and VM from eflags. + * @param piException Stores EXCP_EXECUTE_RAW/HWACC in case raw mode is supported in this context + * + * @remark This function must be kept in perfect sync with the scheduler in EM.cpp! + */ +bool remR3CanExecuteRaw(CPUX86State *env, RTGCPTR eip, unsigned fFlags, int *piException) +{ + /* !!! THIS MUST BE IN SYNC WITH emR3Reschedule !!! */ + /* !!! THIS MUST BE IN SYNC WITH emR3Reschedule !!! */ + /* !!! THIS MUST BE IN SYNC WITH emR3Reschedule !!! */ + uint32_t u32CR0; + + /* Update counter. */ + env->pVM->rem.s.cCanExecuteRaw++; + + /* Never when single stepping+logging guest code. */ + if (env->state & CPU_EMULATE_SINGLE_STEP) + return false; + + if (!VM_IS_RAW_MODE_ENABLED(env->pVM)) + { +#ifdef RT_OS_WINDOWS + PCPUMCTX pCtx = alloca(sizeof(*pCtx)); +#else + CPUMCTX Ctx; + PCPUMCTX pCtx = &Ctx; +#endif + /** @todo NEM: scheduling. */ + + env->state |= CPU_RAW_HM; + + /* + * The simple check first... + */ + if (!EMIsHwVirtExecutionEnabled(env->pVM)) + return false; + + /* + * Create partial context for HMCanExecuteGuest. + */ + pCtx->cr0 = env->cr[0]; + pCtx->cr3 = env->cr[3]; + pCtx->cr4 = env->cr[4]; + + pCtx->tr.Sel = env->tr.selector; + pCtx->tr.ValidSel = env->tr.selector; + pCtx->tr.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->tr.u64Base = env->tr.base; + pCtx->tr.u32Limit = env->tr.limit; + pCtx->tr.Attr.u = (env->tr.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->ldtr.Sel = env->ldt.selector; + pCtx->ldtr.ValidSel = env->ldt.selector; + pCtx->ldtr.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->ldtr.u64Base = env->ldt.base; + pCtx->ldtr.u32Limit = env->ldt.limit; + pCtx->ldtr.Attr.u = (env->ldt.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->idtr.cbIdt = env->idt.limit; + pCtx->idtr.pIdt = env->idt.base; + + pCtx->gdtr.cbGdt = env->gdt.limit; + pCtx->gdtr.pGdt = env->gdt.base; + + pCtx->rsp = env->regs[R_ESP]; + pCtx->rip = env->eip; + + pCtx->eflags.u32 = env->eflags; + + pCtx->cs.Sel = env->segs[R_CS].selector; + pCtx->cs.ValidSel = env->segs[R_CS].selector; + pCtx->cs.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->cs.u64Base = env->segs[R_CS].base; + pCtx->cs.u32Limit = env->segs[R_CS].limit; + pCtx->cs.Attr.u = (env->segs[R_CS].flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->ds.Sel = env->segs[R_DS].selector; + pCtx->ds.ValidSel = env->segs[R_DS].selector; + pCtx->ds.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->ds.u64Base = env->segs[R_DS].base; + pCtx->ds.u32Limit = env->segs[R_DS].limit; + pCtx->ds.Attr.u = (env->segs[R_DS].flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->es.Sel = env->segs[R_ES].selector; + pCtx->es.ValidSel = env->segs[R_ES].selector; + pCtx->es.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->es.u64Base = env->segs[R_ES].base; + pCtx->es.u32Limit = env->segs[R_ES].limit; + pCtx->es.Attr.u = (env->segs[R_ES].flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->fs.Sel = env->segs[R_FS].selector; + pCtx->fs.ValidSel = env->segs[R_FS].selector; + pCtx->fs.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->fs.u64Base = env->segs[R_FS].base; + pCtx->fs.u32Limit = env->segs[R_FS].limit; + pCtx->fs.Attr.u = (env->segs[R_FS].flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->gs.Sel = env->segs[R_GS].selector; + pCtx->gs.ValidSel = env->segs[R_GS].selector; + pCtx->gs.fFlags = CPUMSELREG_FLAGS_VALID; + 
pCtx->gs.u64Base = env->segs[R_GS].base; + pCtx->gs.u32Limit = env->segs[R_GS].limit; + pCtx->gs.Attr.u = (env->segs[R_GS].flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->ss.Sel = env->segs[R_SS].selector; + pCtx->ss.ValidSel = env->segs[R_SS].selector; + pCtx->ss.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->ss.u64Base = env->segs[R_SS].base; + pCtx->ss.u32Limit = env->segs[R_SS].limit; + pCtx->ss.Attr.u = (env->segs[R_SS].flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + + pCtx->msrEFER = env->efer; + + /* + * Hardware accelerated mode: + * Typically only 32-bits protected mode, with paging enabled, code is allowed here. + */ + PVMCPU pVCpu = &env->pVM->aCpus[0]; + if (HMCanExecuteGuest(pVCpu, pCtx)) + { + *piException = EXCP_EXECUTE_HM; + return true; + } + return false; + } + + /* + * Here we only support 16 & 32 bits protected mode ring 3 code that has no IO privileges + * or 32 bits protected mode ring 0 code + * + * The tests are ordered by the likelihood of being true during normal execution. + */ + if (fFlags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)) + { + STAM_COUNTER_INC(&gStatRefuseTFInhibit); + Log2(("raw mode refused: fFlags=%#x\n", fFlags)); + return false; + } + +#ifndef VBOX_RAW_V86 + if (fFlags & VM_MASK) { + STAM_COUNTER_INC(&gStatRefuseVM86); + Log2(("raw mode refused: VM_MASK\n")); + return false; + } +#endif + + if (env->state & CPU_EMULATE_SINGLE_INSTR) + { +#ifndef DEBUG_bird + Log2(("raw mode refused: CPU_EMULATE_SINGLE_INSTR\n")); +#endif + return false; + } + + if (env->singlestep_enabled) + { + //Log2(("raw mode refused: Single step\n")); + return false; + } + + if (!QTAILQ_EMPTY(&env->breakpoints)) + { + //Log2(("raw mode refused: Breakpoints\n")); + return false; + } + + if (!QTAILQ_EMPTY(&env->watchpoints)) + { + //Log2(("raw mode refused: Watchpoints\n")); + return false; + } + + u32CR0 = env->cr[0]; + if ((u32CR0 & (X86_CR0_PG | X86_CR0_PE)) != (X86_CR0_PG | X86_CR0_PE)) + { + STAM_COUNTER_INC(&gStatRefusePaging); + //Log2(("raw mode refused: %s%s%s\n", (u32CR0 & X86_CR0_PG) ? "" : " !PG", (u32CR0 & X86_CR0_PE) ? "" : " !PE", (u32CR0 & X86_CR0_AM) ? "" : " !AM")); + return false; + } + + if (env->cr[4] & CR4_PAE_MASK) + { + if (!(env->cpuid_features & X86_CPUID_FEATURE_EDX_PAE)) + { + STAM_COUNTER_INC(&gStatRefusePAE); + return false; + } + } + + if (((fFlags >> HF_CPL_SHIFT) & 3) == 3) + { + if (!EMIsRawRing3Enabled(env->pVM)) + return false; + + if (!(env->eflags & IF_MASK)) + { + STAM_COUNTER_INC(&gStatRefuseIF0); + Log2(("raw mode refused: IF (RawR3)\n")); + return false; + } + + if (!(u32CR0 & CR0_WP_MASK) && EMIsRawRing0Enabled(env->pVM)) + { + STAM_COUNTER_INC(&gStatRefuseWP0); + Log2(("raw mode refused: CR0.WP + RawR0\n")); + return false; + } + } + else + { + if (!EMIsRawRing0Enabled(env->pVM)) + return false; + + // Let's start with pure 32 bits ring 0 code first + if ((fFlags & (HF_SS32_MASK | HF_CS32_MASK)) != (HF_SS32_MASK | HF_CS32_MASK)) + { + STAM_COUNTER_INC(&gStatRefuseCode16); + Log2(("raw r0 mode refused: HF_[S|C]S32_MASK fFlags=%#x\n", fFlags)); + return false; + } + + if (EMIsRawRing1Enabled(env->pVM)) + { + /* Only ring 0 and 1 supervisor code. */ + if (((fFlags >> HF_CPL_SHIFT) & 3) == 2) /* ring 1 code is moved into ring 2, so we can't support ring-2 in that case. */ + { + Log2(("raw r0 mode refused: CPL %d\n", (fFlags >> HF_CPL_SHIFT) & 3)); + return false; + } + } + /* Only R0. 
*/ + else if (((fFlags >> HF_CPL_SHIFT) & 3) != 0) + { + STAM_COUNTER_INC(&gStatRefuseRing1or2); + Log2(("raw r0 mode refused: CPL %d\n", ((fFlags >> HF_CPL_SHIFT) & 3) )); + return false; + } + + if (!(u32CR0 & CR0_WP_MASK)) + { + STAM_COUNTER_INC(&gStatRefuseWP0); + Log2(("raw r0 mode refused: CR0.WP=0!\n")); + return false; + } + +#ifdef VBOX_WITH_RAW_MODE + if (PATMIsPatchGCAddr(env->pVM, eip)) + { + Log2(("raw r0 mode forced: patch code\n")); + *piException = EXCP_EXECUTE_RAW; + return true; + } +#endif + +#if !defined(VBOX_ALLOW_IF0) && !defined(VBOX_RUN_INTERRUPT_GATE_HANDLERS) + if (!(env->eflags & IF_MASK)) + { + STAM_COUNTER_INC(&gStatRefuseIF0); + ////Log2(("R0: IF=0 VIF=%d %08X\n", eip, *env->pVMeflags)); + //Log2(("RR0: Interrupts turned off; fall back to emulation\n")); + return false; + } +#endif + +#ifndef VBOX_WITH_RAW_RING1 + if (((env->eflags >> IOPL_SHIFT) & 3) != 0) + { + Log2(("raw r0 mode refused: IOPL %d\n", ((env->eflags >> IOPL_SHIFT) & 3))); + return false; + } +#endif + env->state |= CPU_RAW_RING0; + } + + /* + * Don't reschedule the first time we're called, because there might be + * special reasons why we're here that is not covered by the above checks. + */ + if (env->pVM->rem.s.cCanExecuteRaw == 1) + { + Log2(("raw mode refused: first scheduling\n")); + STAM_COUNTER_INC(&gStatRefuseCanExecute); + return false; + } + + /* + * Stale hidden selectors means raw-mode is unsafe (being very careful). + */ + if (env->segs[R_CS].fVBoxFlags & CPUMSELREG_FLAGS_STALE) + { + Log2(("raw mode refused: stale CS (%#x)\n", env->segs[R_CS].selector)); + STAM_COUNTER_INC(&gaStatRefuseStale[R_CS]); + return false; + } + if (env->segs[R_SS].fVBoxFlags & CPUMSELREG_FLAGS_STALE) + { + Log2(("raw mode refused: stale SS (%#x)\n", env->segs[R_SS].selector)); + STAM_COUNTER_INC(&gaStatRefuseStale[R_SS]); + return false; + } + if (env->segs[R_DS].fVBoxFlags & CPUMSELREG_FLAGS_STALE) + { + Log2(("raw mode refused: stale DS (%#x)\n", env->segs[R_DS].selector)); + STAM_COUNTER_INC(&gaStatRefuseStale[R_DS]); + return false; + } + if (env->segs[R_ES].fVBoxFlags & CPUMSELREG_FLAGS_STALE) + { + Log2(("raw mode refused: stale ES (%#x)\n", env->segs[R_ES].selector)); + STAM_COUNTER_INC(&gaStatRefuseStale[R_ES]); + return false; + } + if (env->segs[R_FS].fVBoxFlags & CPUMSELREG_FLAGS_STALE) + { + Log2(("raw mode refused: stale FS (%#x)\n", env->segs[R_FS].selector)); + STAM_COUNTER_INC(&gaStatRefuseStale[R_FS]); + return false; + } + if (env->segs[R_GS].fVBoxFlags & CPUMSELREG_FLAGS_STALE) + { + Log2(("raw mode refused: stale GS (%#x)\n", env->segs[R_GS].selector)); + STAM_COUNTER_INC(&gaStatRefuseStale[R_GS]); + return false; + } + +/* Assert(env->pVCpu && PGMPhysIsA20Enabled(env->pVCpu));*/ + *piException = EXCP_EXECUTE_RAW; + return true; +} + + +#ifdef VBOX_WITH_RAW_MODE +/** + * Fetches a code byte. + * + * @returns Success indicator (bool) for ease of use. + * @param env The CPU environment structure. + * @param GCPtrInstr Where to fetch code. + * @param pu8Byte Where to store the byte on success + */ +bool remR3GetOpcode(CPUX86State *env, RTGCPTR GCPtrInstr, uint8_t *pu8Byte) +{ + int rc = PATMR3QueryOpcode(env->pVM, GCPtrInstr, pu8Byte); + if (RT_SUCCESS(rc)) + return true; + return false; +} +#endif /* VBOX_WITH_RAW_MODE */ + + +/** + * Flush (or invalidate if you like) page table/dir entry. + * + * (invlpg instruction; tlb_flush_page) + * + * @param env Pointer to cpu environment. + * @param GCPtr The virtual address which page table/dir entry should be invalidated. 
+ */ +void remR3FlushPage(CPUX86State *env, RTGCPTR GCPtr) +{ + PVM pVM = env->pVM; + PCPUMCTX pCtx; + int rc; + + Assert(EMRemIsLockOwner(env->pVM)); + + /* + * When we're replaying invlpg instructions or restoring a saved + * state we disable this path. + */ + if (pVM->rem.s.fIgnoreInvlPg || pVM->rem.s.cIgnoreAll) + return; + LogFlow(("remR3FlushPage: GCPtr=%RGv\n", GCPtr)); + Assert(pVM->rem.s.fInREM || pVM->rem.s.fInStateSync); + + //RAWEx_ProfileStop(env, STATS_QEMU_TOTAL); + + /* + * Update the control registers before calling PGMFlushPage. + */ + pCtx = (PCPUMCTX)pVM->rem.s.pCtx; + Assert(pCtx); + pCtx->cr0 = env->cr[0]; + pCtx->cr3 = env->cr[3]; +#ifdef VBOX_WITH_RAW_MODE + if (((env->cr[4] ^ pCtx->cr4) & X86_CR4_VME) && VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(env->pVCpu, VMCPU_FF_SELM_SYNC_TSS); +#endif + pCtx->cr4 = env->cr[4]; + + /* + * Let PGM do the rest. + */ + Assert(env->pVCpu); + rc = PGMInvalidatePage(env->pVCpu, GCPtr); + if (RT_FAILURE(rc)) + { + AssertMsgFailed(("remR3FlushPage %RGv failed with %d!!\n", GCPtr, rc)); + VMCPU_FF_SET(env->pVCpu, VMCPU_FF_PGM_SYNC_CR3); + } + //RAWEx_ProfileStart(env, STATS_QEMU_TOTAL); +} + + +#ifndef REM_PHYS_ADDR_IN_TLB +/** Wrapper for PGMR3PhysTlbGCPhys2Ptr. */ +void *remR3TlbGCPhys2Ptr(CPUX86State *env1, target_ulong physAddr, int fWritable) +{ + void *pv; + int rc; + + + /* Address must be aligned enough to fiddle with lower bits */ + Assert((physAddr & 0x3) == 0); + /*AssertMsg((env1->a20_mask & physAddr) == physAddr, ("%llx\n", (uint64_t)physAddr));*/ + + STAM_PROFILE_START(&gStatGCPhys2HCVirt, a); + rc = PGMR3PhysTlbGCPhys2Ptr(env1->pVM, physAddr, true /*fWritable*/, &pv); + STAM_PROFILE_STOP(&gStatGCPhys2HCVirt, a); + Assert( rc == VINF_SUCCESS + || rc == VINF_PGM_PHYS_TLB_CATCH_WRITE + || rc == VERR_PGM_PHYS_TLB_CATCH_ALL + || rc == VERR_PGM_PHYS_TLB_UNASSIGNED); + if (RT_FAILURE(rc)) + return (void *)1; + if (rc == VINF_PGM_PHYS_TLB_CATCH_WRITE) + return (void *)((uintptr_t)pv | 2); + return pv; +} +#endif /* REM_PHYS_ADDR_IN_TLB */ + + +/** + * Called from tlb_protect_code in order to write monitor a code page. + * + * @param env Pointer to the CPU environment. + * @param GCPtr Code page to monitor + */ +void remR3ProtectCode(CPUX86State *env, RTGCPTR GCPtr) +{ +#ifdef VBOX_REM_PROTECT_PAGES_FROM_SMC + Assert(env->pVM->rem.s.fInREM); + if ( (env->cr[0] & X86_CR0_PG) /* paging must be enabled */ + && !(env->state & CPU_EMULATE_SINGLE_INSTR) /* ignore during single instruction execution */ + && (((env->hflags >> HF_CPL_SHIFT) & 3) == 0) /* supervisor mode only */ + && !(env->eflags & VM_MASK) /* no V86 mode */ + && VM_IS_RAW_MODE_ENABLED(env->pVM)) + CSAMR3MonitorPage(env->pVM, GCPtr, CSAM_TAG_REM); +#endif +} + + +/** + * Called from tlb_unprotect_code in order to clear write monitoring for a code page. + * + * @param env Pointer to the CPU environment. 
+ * @param GCPtr Code page to monitor + */ +void remR3UnprotectCode(CPUX86State *env, RTGCPTR GCPtr) +{ + Assert(env->pVM->rem.s.fInREM); +#ifdef VBOX_REM_PROTECT_PAGES_FROM_SMC + if ( (env->cr[0] & X86_CR0_PG) /* paging must be enabled */ + && !(env->state & CPU_EMULATE_SINGLE_INSTR) /* ignore during single instruction execution */ + && (((env->hflags >> HF_CPL_SHIFT) & 3) == 0) /* supervisor mode only */ + && !(env->eflags & VM_MASK) /* no V86 mode */ + && VM_IS_RAW_MODE_ENABLED(env->pVM)) + CSAMR3UnmonitorPage(env->pVM, GCPtr, CSAM_TAG_REM); +#endif +} + + +/** + * Called when the CPU is initialized, any of the CRx registers are changed or + * when the A20 line is modified. + * + * @param env Pointer to the CPU environment. + * @param fGlobal Set if the flush is global. + */ +void remR3FlushTLB(CPUX86State *env, bool fGlobal) +{ + PVM pVM = env->pVM; + PCPUMCTX pCtx; + Assert(EMRemIsLockOwner(pVM)); + + /* + * When we're replaying invlpg instructions or restoring a saved + * state we disable this path. + */ + if (pVM->rem.s.fIgnoreCR3Load || pVM->rem.s.cIgnoreAll) + return; + Assert(pVM->rem.s.fInREM); + + /* + * The caller doesn't check cr4, so we have to do that for ourselves. + */ + if (!fGlobal && !(env->cr[4] & X86_CR4_PGE)) + fGlobal = true; + Log(("remR3FlushTLB: CR0=%08RX64 CR3=%08RX64 CR4=%08RX64 %s\n", (uint64_t)env->cr[0], (uint64_t)env->cr[3], (uint64_t)env->cr[4], fGlobal ? " global" : "")); + + /* + * Update the control registers before calling PGMR3FlushTLB. + */ + pCtx = (PCPUMCTX)pVM->rem.s.pCtx; + Assert(pCtx); + pCtx->cr0 = env->cr[0]; + pCtx->cr3 = env->cr[3]; +#ifdef VBOX_WITH_RAW_MODE + if (((env->cr[4] ^ pCtx->cr4) & X86_CR4_VME) && VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(env->pVCpu, VMCPU_FF_SELM_SYNC_TSS); +#endif + pCtx->cr4 = env->cr[4]; + + /* + * Let PGM do the rest. + */ + Assert(env->pVCpu); + PGMFlushTLB(env->pVCpu, env->cr[3], fGlobal); +} + + +/** + * Called when any of the cr0, cr4 or efer registers is updated. + * + * @param env Pointer to the CPU environment. + */ +void remR3ChangeCpuMode(CPUX86State *env) +{ + PVM pVM = env->pVM; + uint64_t efer; + PCPUMCTX pCtx; + int rc; + + /* + * When we're replaying loads or restoring a saved + * state this path is disabled. + */ + if (pVM->rem.s.fIgnoreCpuMode || pVM->rem.s.cIgnoreAll) + return; + Assert(pVM->rem.s.fInREM); + + pCtx = (PCPUMCTX)pVM->rem.s.pCtx; + Assert(pCtx); + + /* + * Notify PGM about WP0 being enabled (like CPUSetGuestCR0 does). + */ + if (((env->cr[0] ^ pCtx->cr0) & X86_CR0_WP) && (env->cr[0] & X86_CR0_WP)) + PGMCr0WpEnabled(env->pVCpu); + + /* + * Update the control registers before calling PGMChangeMode() + * as it may need to map whatever cr3 is pointing to. + */ + pCtx->cr0 = env->cr[0]; + pCtx->cr3 = env->cr[3]; +#ifdef VBOX_WITH_RAW_MODE + if (((env->cr[4] ^ pCtx->cr4) & X86_CR4_VME) && VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(env->pVCpu, VMCPU_FF_SELM_SYNC_TSS); +#endif + pCtx->cr4 = env->cr[4]; +#ifdef TARGET_X86_64 + efer = env->efer; + pCtx->msrEFER = efer; +#else + efer = 0; +#endif + Assert(env->pVCpu); + rc = PGMChangeMode(env->pVCpu, env->cr[0], env->cr[4], efer); + if (rc != VINF_SUCCESS) + { + if (rc >= VINF_EM_FIRST && rc <= VINF_EM_LAST) + { + Log(("PGMChangeMode(, %RX64, %RX64, %RX64) -> %Rrc -> remR3RaiseRC\n", env->cr[0], env->cr[4], efer, rc)); + remR3RaiseRC(env->pVM, rc); + } + else + cpu_abort(env, "PGMChangeMode(, %RX64, %RX64, %RX64) -> %Rrc\n", env->cr[0], env->cr[4], efer, rc); + } +} + + +/** + * Called from compiled code to run dma. 
+ * + * @param env Pointer to the CPU environment. + */ +void remR3DmaRun(CPUX86State *env) +{ + remR3ProfileStop(STATS_QEMU_RUN_EMULATED_CODE); + PDMR3DmaRun(env->pVM); + remR3ProfileStart(STATS_QEMU_RUN_EMULATED_CODE); +} + + +/** + * Called from compiled code to schedule pending timers in VMM + * + * @param env Pointer to the CPU environment. + */ +void remR3TimersRun(CPUX86State *env) +{ + LogFlow(("remR3TimersRun:\n")); + LogIt(RTLOGGRPFLAGS_LEVEL_5, LOG_GROUP_TM, ("remR3TimersRun\n")); + remR3ProfileStop(STATS_QEMU_RUN_EMULATED_CODE); + remR3ProfileStart(STATS_QEMU_RUN_TIMERS); + TMR3TimerQueuesDo(env->pVM); + remR3ProfileStop(STATS_QEMU_RUN_TIMERS); + remR3ProfileStart(STATS_QEMU_RUN_EMULATED_CODE); +} + + +/** + * Record trap occurrence + * + * @returns VBox status code + * @param env Pointer to the CPU environment. + * @param uTrap Trap nr + * @param uErrorCode Error code + * @param pvNextEIP Next EIP + */ +int remR3NotifyTrap(CPUX86State *env, uint32_t uTrap, uint32_t uErrorCode, RTGCPTR pvNextEIP) +{ + PVM pVM = env->pVM; +#ifdef VBOX_WITH_STATISTICS + static STAMCOUNTER s_aStatTrap[255]; + static bool s_aRegisters[RT_ELEMENTS(s_aStatTrap)]; +#endif + +#ifdef VBOX_WITH_STATISTICS + if (uTrap < 255) + { + if (!s_aRegisters[uTrap]) + { + char szStatName[64]; + s_aRegisters[uTrap] = true; + RTStrPrintf(szStatName, sizeof(szStatName), "/REM/Trap/0x%02X", uTrap); + STAM_REG(env->pVM, &s_aStatTrap[uTrap], STAMTYPE_COUNTER, szStatName, STAMUNIT_OCCURENCES, "Trap stats."); + } + STAM_COUNTER_INC(&s_aStatTrap[uTrap]); + } +#endif + Log(("remR3NotifyTrap: uTrap=%x error=%x next_eip=%RGv eip=%RGv cr2=%RGv\n", uTrap, uErrorCode, pvNextEIP, (RTGCPTR)env->eip, (RTGCPTR)env->cr[2])); + if( uTrap < 0x20 + && (env->cr[0] & X86_CR0_PE) + && !(env->eflags & X86_EFL_VM)) + { +#ifdef DEBUG + remR3DisasInstr(env, 1, "remR3NotifyTrap: "); +#endif + if(pVM->rem.s.uPendingException == uTrap && ++pVM->rem.s.cPendingExceptions > 512) + { + LogRel(("VERR_REM_TOO_MANY_TRAPS -> uTrap=%x error=%x next_eip=%RGv eip=%RGv cr2=%RGv\n", uTrap, uErrorCode, pvNextEIP, (RTGCPTR)env->eip, (RTGCPTR)env->cr[2])); + remR3RaiseRC(env->pVM, VERR_REM_TOO_MANY_TRAPS); + return VERR_REM_TOO_MANY_TRAPS; + } + if(pVM->rem.s.uPendingException != uTrap || pVM->rem.s.uPendingExcptEIP != env->eip || pVM->rem.s.uPendingExcptCR2 != env->cr[2]) + { + Log(("remR3NotifyTrap: uTrap=%#x set as pending\n", uTrap)); + pVM->rem.s.cPendingExceptions = 1; + } + pVM->rem.s.uPendingException = uTrap; + pVM->rem.s.uPendingExcptEIP = env->eip; + pVM->rem.s.uPendingExcptCR2 = env->cr[2]; + } + else + { + pVM->rem.s.cPendingExceptions = 0; + pVM->rem.s.uPendingException = uTrap; + pVM->rem.s.uPendingExcptEIP = env->eip; + pVM->rem.s.uPendingExcptCR2 = env->cr[2]; + } + return VINF_SUCCESS; +} + + +/* + * Clear current active trap + * + * @param pVM VM Handle. + */ +void remR3TrapClear(PVM pVM) +{ + pVM->rem.s.cPendingExceptions = 0; + pVM->rem.s.uPendingException = 0; + pVM->rem.s.uPendingExcptEIP = 0; + pVM->rem.s.uPendingExcptCR2 = 0; +} + + +/* + * Record previous call instruction addresses + * + * @param env Pointer to the CPU environment. + */ +void remR3RecordCall(CPUX86State *env) +{ +#ifdef VBOX_WITH_RAW_MODE + CSAMR3RecordCallAddress(env->pVM, env->eip); +#endif +} + + +/** + * Syncs the internal REM state with the VM. + * + * This must be called before REMR3Run() is invoked whenever when the REM + * state is not up to date. Calling it several times in a row is not + * permitted. + * + * @returns VBox status code. 
+ * + * @param pVM VM Handle. + * @param pVCpu VMCPU Handle. + * + * @remark The caller has to check for important FFs before calling REMR3Run. REMR3State will + * no do this since the majority of the callers don't want any unnecessary of events + * pending that would immediately interrupt execution. + */ +REMR3DECL(int) REMR3State(PVM pVM, PVMCPU pVCpu) +{ + register const CPUMCTX *pCtx; + register unsigned fFlags; + unsigned i; + TRPMEVENT enmType; + uint8_t u8TrapNo; + uint32_t uCpl; + int rc; + + STAM_PROFILE_START(&pVM->rem.s.StatsState, a); + Log2(("REMR3State:\n")); + + pVM->rem.s.Env.pVCpu = pVCpu; + pCtx = pVM->rem.s.pCtx = CPUMQueryGuestCtxPtr(pVCpu); + + Assert(pCtx); + if ( CPUMIsGuestInSvmNestedHwVirtMode(pCtx) + || CPUMIsGuestInVmxNonRootMode(pCtx)) + { + AssertMsgFailed(("Bad scheduling - can't exec. nested-guest in REM!\n")); + return VERR_EM_CANNOT_EXEC_GUEST; + } + + Assert(!pVM->rem.s.fInREM); + pVM->rem.s.fInStateSync = true; + + /* + * If we have to flush TBs, do that immediately. + */ + if (pVM->rem.s.fFlushTBs) + { + STAM_COUNTER_INC(&gStatFlushTBs); + tb_flush(&pVM->rem.s.Env); + pVM->rem.s.fFlushTBs = false; + } + + /* + * Copy the registers which require no special handling. + */ +#ifdef TARGET_X86_64 + /* Note that the high dwords of 64 bits registers are undefined in 32 bits mode and are undefined after a mode change. */ + Assert(R_EAX == 0); + pVM->rem.s.Env.regs[R_EAX] = pCtx->rax; + Assert(R_ECX == 1); + pVM->rem.s.Env.regs[R_ECX] = pCtx->rcx; + Assert(R_EDX == 2); + pVM->rem.s.Env.regs[R_EDX] = pCtx->rdx; + Assert(R_EBX == 3); + pVM->rem.s.Env.regs[R_EBX] = pCtx->rbx; + Assert(R_ESP == 4); + pVM->rem.s.Env.regs[R_ESP] = pCtx->rsp; + Assert(R_EBP == 5); + pVM->rem.s.Env.regs[R_EBP] = pCtx->rbp; + Assert(R_ESI == 6); + pVM->rem.s.Env.regs[R_ESI] = pCtx->rsi; + Assert(R_EDI == 7); + pVM->rem.s.Env.regs[R_EDI] = pCtx->rdi; + pVM->rem.s.Env.regs[8] = pCtx->r8; + pVM->rem.s.Env.regs[9] = pCtx->r9; + pVM->rem.s.Env.regs[10] = pCtx->r10; + pVM->rem.s.Env.regs[11] = pCtx->r11; + pVM->rem.s.Env.regs[12] = pCtx->r12; + pVM->rem.s.Env.regs[13] = pCtx->r13; + pVM->rem.s.Env.regs[14] = pCtx->r14; + pVM->rem.s.Env.regs[15] = pCtx->r15; + + pVM->rem.s.Env.eip = pCtx->rip; + + pVM->rem.s.Env.eflags = pCtx->rflags.u64; +#else + Assert(R_EAX == 0); + pVM->rem.s.Env.regs[R_EAX] = pCtx->eax; + Assert(R_ECX == 1); + pVM->rem.s.Env.regs[R_ECX] = pCtx->ecx; + Assert(R_EDX == 2); + pVM->rem.s.Env.regs[R_EDX] = pCtx->edx; + Assert(R_EBX == 3); + pVM->rem.s.Env.regs[R_EBX] = pCtx->ebx; + Assert(R_ESP == 4); + pVM->rem.s.Env.regs[R_ESP] = pCtx->esp; + Assert(R_EBP == 5); + pVM->rem.s.Env.regs[R_EBP] = pCtx->ebp; + Assert(R_ESI == 6); + pVM->rem.s.Env.regs[R_ESI] = pCtx->esi; + Assert(R_EDI == 7); + pVM->rem.s.Env.regs[R_EDI] = pCtx->edi; + pVM->rem.s.Env.eip = pCtx->eip; + + pVM->rem.s.Env.eflags = pCtx->eflags.u32; +#endif + + pVM->rem.s.Env.cr[2] = pCtx->cr2; + + /** @todo we could probably benefit from using a CPUM_CHANGED_DRx flag too! */ + for (i=0;i<8;i++) + pVM->rem.s.Env.dr[i] = pCtx->dr[i]; + +#ifdef HF_HALTED_MASK /** @todo remove me when we're up to date again. */ + /* + * Clear the halted hidden flag (the interrupt waking up the CPU can + * have been dispatched in raw mode). + */ + pVM->rem.s.Env.hflags &= ~HF_HALTED_MASK; +#endif + + /* + * Replay invlpg? Only if we're not flushing the TLB. 
+ */ + fFlags = CPUMR3RemEnter(pVCpu, &uCpl); + LogFlow(("CPUMR3RemEnter %x %x\n", fFlags, uCpl)); + if (pVM->rem.s.cInvalidatedPages) + { + if (!(fFlags & CPUM_CHANGED_GLOBAL_TLB_FLUSH)) + { + RTUINT i; + + pVM->rem.s.fIgnoreCR3Load = true; + pVM->rem.s.fIgnoreInvlPg = true; + for (i = 0; i < pVM->rem.s.cInvalidatedPages; i++) + { + Log2(("REMR3State: invlpg %RGv\n", pVM->rem.s.aGCPtrInvalidatedPages[i])); + tlb_flush_page(&pVM->rem.s.Env, pVM->rem.s.aGCPtrInvalidatedPages[i]); + } + pVM->rem.s.fIgnoreInvlPg = false; + pVM->rem.s.fIgnoreCR3Load = false; + } + pVM->rem.s.cInvalidatedPages = 0; + } + + /* Replay notification changes. */ + REMR3ReplayHandlerNotifications(pVM); + + /* Update MSRs; before CRx registers! */ + pVM->rem.s.Env.efer = pCtx->msrEFER; + pVM->rem.s.Env.star = pCtx->msrSTAR; + pVM->rem.s.Env.pat = pCtx->msrPAT; +#ifdef TARGET_X86_64 + pVM->rem.s.Env.lstar = pCtx->msrLSTAR; + pVM->rem.s.Env.cstar = pCtx->msrCSTAR; + pVM->rem.s.Env.fmask = pCtx->msrSFMASK; + pVM->rem.s.Env.kernelgsbase = pCtx->msrKERNELGSBASE; + + /* Update the internal long mode activate flag according to the new EFER value. */ + if (pCtx->msrEFER & MSR_K6_EFER_LMA) + pVM->rem.s.Env.hflags |= HF_LMA_MASK; + else + pVM->rem.s.Env.hflags &= ~(HF_LMA_MASK | HF_CS64_MASK); +#endif + + /* Update the inhibit IRQ mask. */ + pVM->rem.s.Env.hflags &= ~HF_INHIBIT_IRQ_MASK; + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + { + RTGCPTR InhibitPC = EMGetInhibitInterruptsPC(pVCpu); + if (InhibitPC == pCtx->rip) + pVM->rem.s.Env.hflags |= HF_INHIBIT_IRQ_MASK; + else + { + Log(("Clearing VMCPU_FF_INHIBIT_INTERRUPTS at %RGv - successor %RGv (REM#1)\n", (RTGCPTR)pCtx->rip, InhibitPC)); + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + } + } + + /* Update the inhibit NMI mask. */ + pVM->rem.s.Env.hflags2 &= ~HF2_NMI_MASK; + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + pVM->rem.s.Env.hflags2 |= HF2_NMI_MASK; + + /* + * Sync the A20 gate. + */ + bool fA20State = PGMPhysIsA20Enabled(pVCpu); + if (fA20State != RT_BOOL(pVM->rem.s.Env.a20_mask & RT_BIT(20))) + { + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + cpu_x86_set_a20(&pVM->rem.s.Env, fA20State); + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); + } + + /* + * Registers which are rarely changed and require special handling / order when changed. + */ + if (fFlags & ( CPUM_CHANGED_GLOBAL_TLB_FLUSH + | CPUM_CHANGED_CR4 + | CPUM_CHANGED_CR0 + | CPUM_CHANGED_CR3 + | CPUM_CHANGED_GDTR + | CPUM_CHANGED_IDTR + | CPUM_CHANGED_SYSENTER_MSR + | CPUM_CHANGED_LDTR + | CPUM_CHANGED_CPUID + | CPUM_CHANGED_FPU_REM + ) + ) + { + if (fFlags & CPUM_CHANGED_GLOBAL_TLB_FLUSH) + { + pVM->rem.s.fIgnoreCR3Load = true; + tlb_flush(&pVM->rem.s.Env, true); + pVM->rem.s.fIgnoreCR3Load = false; + } + + /* CR4 before CR0! 
*/ + if (fFlags & CPUM_CHANGED_CR4) + { + pVM->rem.s.fIgnoreCR3Load = true; + pVM->rem.s.fIgnoreCpuMode = true; + cpu_x86_update_cr4(&pVM->rem.s.Env, pCtx->cr4); + pVM->rem.s.fIgnoreCpuMode = false; + pVM->rem.s.fIgnoreCR3Load = false; + } + + if (fFlags & CPUM_CHANGED_CR0) + { + pVM->rem.s.fIgnoreCR3Load = true; + pVM->rem.s.fIgnoreCpuMode = true; + cpu_x86_update_cr0(&pVM->rem.s.Env, pCtx->cr0); + pVM->rem.s.fIgnoreCpuMode = false; + pVM->rem.s.fIgnoreCR3Load = false; + } + + if (fFlags & CPUM_CHANGED_CR3) + { + pVM->rem.s.fIgnoreCR3Load = true; + cpu_x86_update_cr3(&pVM->rem.s.Env, pCtx->cr3); + pVM->rem.s.fIgnoreCR3Load = false; + } + + if (fFlags & CPUM_CHANGED_GDTR) + { + pVM->rem.s.Env.gdt.base = pCtx->gdtr.pGdt; + pVM->rem.s.Env.gdt.limit = pCtx->gdtr.cbGdt; + } + + if (fFlags & CPUM_CHANGED_IDTR) + { + pVM->rem.s.Env.idt.base = pCtx->idtr.pIdt; + pVM->rem.s.Env.idt.limit = pCtx->idtr.cbIdt; + } + + if (fFlags & CPUM_CHANGED_SYSENTER_MSR) + { + pVM->rem.s.Env.sysenter_cs = pCtx->SysEnter.cs; + pVM->rem.s.Env.sysenter_eip = pCtx->SysEnter.eip; + pVM->rem.s.Env.sysenter_esp = pCtx->SysEnter.esp; + } + + if (fFlags & CPUM_CHANGED_LDTR) + { + if (pCtx->ldtr.fFlags & CPUMSELREG_FLAGS_VALID) + { + pVM->rem.s.Env.ldt.selector = pCtx->ldtr.Sel; + pVM->rem.s.Env.ldt.newselector = 0; + pVM->rem.s.Env.ldt.fVBoxFlags = pCtx->ldtr.fFlags; + pVM->rem.s.Env.ldt.base = pCtx->ldtr.u64Base; + pVM->rem.s.Env.ldt.limit = pCtx->ldtr.u32Limit; + pVM->rem.s.Env.ldt.flags = (pCtx->ldtr.Attr.u & SEL_FLAGS_SMASK) << SEL_FLAGS_SHIFT; + } + else + { + AssertFailed(); /* Shouldn't happen, see cpumR3LoadExec. */ + sync_ldtr(&pVM->rem.s.Env, pCtx->ldtr.Sel); + } + } + + if (fFlags & CPUM_CHANGED_CPUID) + { + uint32_t u32Dummy; + + /* + * Get the CPUID features. + */ + CPUMGetGuestCpuId(pVCpu, 1, 0, &u32Dummy, &u32Dummy, &pVM->rem.s.Env.cpuid_ext_features, &pVM->rem.s.Env.cpuid_features); + CPUMGetGuestCpuId(pVCpu, 0x80000001, 0, &u32Dummy, &u32Dummy, &u32Dummy, &pVM->rem.s.Env.cpuid_ext2_features); + } + + /* Sync FPU state after CR4, CPUID and EFER (!). */ + if (fFlags & CPUM_CHANGED_FPU_REM) + save_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)&pCtx->pXStateR3->x87); /* 'save' is an excellent name. */ + } + + /* + * Sync TR unconditionally to make life simpler. + */ + pVM->rem.s.Env.tr.selector = pCtx->tr.Sel; + pVM->rem.s.Env.tr.newselector = 0; + pVM->rem.s.Env.tr.fVBoxFlags = pCtx->tr.fFlags; + pVM->rem.s.Env.tr.base = pCtx->tr.u64Base; + pVM->rem.s.Env.tr.limit = pCtx->tr.u32Limit; + pVM->rem.s.Env.tr.flags = (pCtx->tr.Attr.u & SEL_FLAGS_SMASK) << SEL_FLAGS_SHIFT; + + /* + * Update selector registers. + * + * This must be done *after* we've synced gdt, ldt and crX registers + * since we're reading the GDT/LDT om sync_seg. This will happen with + * saved state which takes a quick dip into rawmode for instance. + * + * CPL/Stack; Note first check this one as the CPL might have changed. + * The wrong CPL can cause QEmu to raise an exception in sync_seg!! + */ + cpu_x86_set_cpl(&pVM->rem.s.Env, uCpl); + /* Note! QEmu saves the 2nd dword of the descriptor; we should convert the attribute word back! 
*/ +#define SYNC_IN_SREG(a_pEnv, a_SReg, a_pRemSReg, a_pVBoxSReg) \ + do \ + { \ + if (CPUMSELREG_ARE_HIDDEN_PARTS_VALID(pVCpu, a_pVBoxSReg)) \ + { \ + cpu_x86_load_seg_cache(a_pEnv, R_##a_SReg, \ + (a_pVBoxSReg)->Sel, \ + (a_pVBoxSReg)->u64Base, \ + (a_pVBoxSReg)->u32Limit, \ + ((a_pVBoxSReg)->Attr.u & SEL_FLAGS_SMASK) << SEL_FLAGS_SHIFT); \ + (a_pRemSReg)->fVBoxFlags = (a_pVBoxSReg)->fFlags; \ + } \ + /* This only-reload-if-changed stuff is the old approach, we should ditch it. */ \ + else if ((a_pRemSReg)->selector != (a_pVBoxSReg)->Sel) \ + { \ + Log2(("REMR3State: " #a_SReg " changed from %04x to %04x!\n", \ + (a_pRemSReg)->selector, (a_pVBoxSReg)->Sel)); \ + sync_seg(a_pEnv, R_##a_SReg, (a_pVBoxSReg)->Sel); \ + if ((a_pRemSReg)->newselector) \ + STAM_COUNTER_INC(&gStatSelOutOfSync[R_##a_SReg]); \ + } \ + else \ + (a_pRemSReg)->newselector = 0; \ + } while (0) + + SYNC_IN_SREG(&pVM->rem.s.Env, CS, &pVM->rem.s.Env.segs[R_CS], &pCtx->cs); + SYNC_IN_SREG(&pVM->rem.s.Env, SS, &pVM->rem.s.Env.segs[R_SS], &pCtx->ss); + SYNC_IN_SREG(&pVM->rem.s.Env, DS, &pVM->rem.s.Env.segs[R_DS], &pCtx->ds); + SYNC_IN_SREG(&pVM->rem.s.Env, ES, &pVM->rem.s.Env.segs[R_ES], &pCtx->es); + SYNC_IN_SREG(&pVM->rem.s.Env, FS, &pVM->rem.s.Env.segs[R_FS], &pCtx->fs); + SYNC_IN_SREG(&pVM->rem.s.Env, GS, &pVM->rem.s.Env.segs[R_GS], &pCtx->gs); + /** @todo need to find a way to communicate potential GDT/LDT changes and thread switches. The selector might + * be the same but not the base/limit. */ + + /* + * Check for traps. + */ + pVM->rem.s.Env.exception_index = -1; /** @todo this won't work :/ */ + rc = TRPMQueryTrap(pVCpu, &u8TrapNo, &enmType); + if (RT_SUCCESS(rc)) + { +#ifdef DEBUG + if (u8TrapNo == 0x80) + { + remR3DumpLnxSyscall(pVCpu); + remR3DumpOBsdSyscall(pVCpu); + } +#endif + + pVM->rem.s.Env.exception_index = u8TrapNo; + if (enmType != TRPM_SOFTWARE_INT) + { + pVM->rem.s.Env.exception_is_int = enmType == TRPM_HARDWARE_INT + ? EXCEPTION_IS_INT_VALUE_HARDWARE_IRQ : 0; /* HACK ALERT! */ + pVM->rem.s.Env.exception_next_eip = pVM->rem.s.Env.eip; + } + else + { + /* + * The there are two 1 byte opcodes and one 2 byte opcode for software interrupts. + * We ASSUME that there are no prefixes and sets the default to 2 byte, and checks + * for int03 and into. + */ + pVM->rem.s.Env.exception_is_int = 1; + pVM->rem.s.Env.exception_next_eip = pCtx->rip + 2; + /* int 3 may be generated by one-byte 0xcc */ + if (u8TrapNo == 3) + { + if (read_byte(&pVM->rem.s.Env, pVM->rem.s.Env.segs[R_CS].base + pCtx->rip) == 0xcc) + pVM->rem.s.Env.exception_next_eip = pCtx->rip + 1; + } + /* int 4 may be generated by one-byte 0xce */ + else if (u8TrapNo == 4) + { + if (read_byte(&pVM->rem.s.Env, pVM->rem.s.Env.segs[R_CS].base + pCtx->rip) == 0xce) + pVM->rem.s.Env.exception_next_eip = pCtx->rip + 1; + } + } + + /* get error code and cr2 if needed. */ + if (enmType == TRPM_TRAP) + { + switch (u8TrapNo) + { + case X86_XCPT_PF: + pVM->rem.s.Env.cr[2] = TRPMGetFaultAddress(pVCpu); + /* fallthru */ + case X86_XCPT_TS: case X86_XCPT_NP: case X86_XCPT_SS: case X86_XCPT_GP: + pVM->rem.s.Env.error_code = TRPMGetErrorCode(pVCpu); + break; + + case X86_XCPT_AC: case X86_XCPT_DF: + default: + pVM->rem.s.Env.error_code = 0; + break; + } + } + else + pVM->rem.s.Env.error_code = 0; + + /* + * We can now reset the active trap since the recompiler is gonna have a go at it. 
+ */ + rc = TRPMResetTrap(pVCpu); + AssertRC(rc); + Log2(("REMR3State: trap=%02x errcd=%RGv cr2=%RGv nexteip=%RGv%s\n", pVM->rem.s.Env.exception_index, (RTGCPTR)pVM->rem.s.Env.error_code, + (RTGCPTR)pVM->rem.s.Env.cr[2], (RTGCPTR)pVM->rem.s.Env.exception_next_eip, pVM->rem.s.Env.exception_is_int ? " software" : "")); + } + + /* + * Clear old interrupt request flags; Check for pending hardware interrupts. + * (See @remark for why we don't check for other FFs.) + */ + pVM->rem.s.Env.interrupt_request &= ~(CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXITTB | CPU_INTERRUPT_TIMER); + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC)) + APICUpdatePendingInterrupts(pVCpu); + if (VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC)) + pVM->rem.s.Env.interrupt_request |= CPU_INTERRUPT_HARD; + + /* + * We're now in REM mode. + */ + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED_EXEC_REM); + pVM->rem.s.fInREM = true; + pVM->rem.s.fInStateSync = false; + pVM->rem.s.cCanExecuteRaw = 0; + STAM_PROFILE_STOP(&pVM->rem.s.StatsState, a); + Log2(("REMR3State: returns VINF_SUCCESS\n")); + return VINF_SUCCESS; +} + + +/** + * Syncs back changes in the REM state to the VM state. + * + * This must be called after invoking REMR3Run(). + * Calling it several times in a row is not permitted. + * + * @returns VBox status code. + * + * @param pVM VM Handle. + * @param pVCpu VMCPU Handle. + */ +REMR3DECL(int) REMR3StateBack(PVM pVM, PVMCPU pVCpu) +{ + register PCPUMCTX pCtx = pVM->rem.s.pCtx; + Assert(pCtx); + unsigned i; + + STAM_PROFILE_START(&pVM->rem.s.StatsStateBack, a); + Log2(("REMR3StateBack:\n")); + Assert(pVM->rem.s.fInREM); + + /* + * Copy back the registers. + * This is done in the order they are declared in the CPUMCTX structure. + */ + + /** @todo FOP */ + /** @todo FPUIP */ + /** @todo CS */ + /** @todo FPUDP */ + /** @todo DS */ + + /** @todo check if FPU/XMM was actually used in the recompiler */ + restore_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)&pCtx->pXStateR3->x87); +//// dprintf2(("FPU state CW=%04X TT=%04X SW=%04X (%04X)\n", env->fpuc, env->fpstt, env->fpus, pVMCtx->fpu.FSW)); + +#ifdef TARGET_X86_64 + /* Note that the high dwords of 64 bits registers are undefined in 32 bits mode and are undefined after a mode change. 
*/ + pCtx->rdi = pVM->rem.s.Env.regs[R_EDI]; + pCtx->rsi = pVM->rem.s.Env.regs[R_ESI]; + pCtx->rbp = pVM->rem.s.Env.regs[R_EBP]; + pCtx->rax = pVM->rem.s.Env.regs[R_EAX]; + pCtx->rbx = pVM->rem.s.Env.regs[R_EBX]; + pCtx->rdx = pVM->rem.s.Env.regs[R_EDX]; + pCtx->rcx = pVM->rem.s.Env.regs[R_ECX]; + pCtx->r8 = pVM->rem.s.Env.regs[8]; + pCtx->r9 = pVM->rem.s.Env.regs[9]; + pCtx->r10 = pVM->rem.s.Env.regs[10]; + pCtx->r11 = pVM->rem.s.Env.regs[11]; + pCtx->r12 = pVM->rem.s.Env.regs[12]; + pCtx->r13 = pVM->rem.s.Env.regs[13]; + pCtx->r14 = pVM->rem.s.Env.regs[14]; + pCtx->r15 = pVM->rem.s.Env.regs[15]; + + pCtx->rsp = pVM->rem.s.Env.regs[R_ESP]; + +#else + pCtx->edi = pVM->rem.s.Env.regs[R_EDI]; + pCtx->esi = pVM->rem.s.Env.regs[R_ESI]; + pCtx->ebp = pVM->rem.s.Env.regs[R_EBP]; + pCtx->eax = pVM->rem.s.Env.regs[R_EAX]; + pCtx->ebx = pVM->rem.s.Env.regs[R_EBX]; + pCtx->edx = pVM->rem.s.Env.regs[R_EDX]; + pCtx->ecx = pVM->rem.s.Env.regs[R_ECX]; + + pCtx->esp = pVM->rem.s.Env.regs[R_ESP]; +#endif + +#define SYNC_BACK_SREG(a_sreg, a_SREG) \ + do \ + { \ + pCtx->a_sreg.Sel = pVM->rem.s.Env.segs[R_##a_SREG].selector; \ + if (!pVM->rem.s.Env.segs[R_SS].newselector) \ + { \ + pCtx->a_sreg.ValidSel = pVM->rem.s.Env.segs[R_##a_SREG].selector; \ + pCtx->a_sreg.fFlags = CPUMSELREG_FLAGS_VALID; \ + pCtx->a_sreg.u64Base = pVM->rem.s.Env.segs[R_##a_SREG].base; \ + pCtx->a_sreg.u32Limit = pVM->rem.s.Env.segs[R_##a_SREG].limit; \ + /* Note! QEmu saves the 2nd dword of the descriptor; we (VT-x/AMD-V) keep only the attributes! */ \ + pCtx->a_sreg.Attr.u = (pVM->rem.s.Env.segs[R_##a_SREG].flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; \ + } \ + else \ + { \ + pCtx->a_sreg.fFlags = 0; \ + STAM_COUNTER_INC(&gStatSelOutOfSyncStateBack[R_##a_SREG]); \ + } \ + } while (0) + + SYNC_BACK_SREG(es, ES); + SYNC_BACK_SREG(cs, CS); + SYNC_BACK_SREG(ss, SS); + SYNC_BACK_SREG(ds, DS); + SYNC_BACK_SREG(fs, FS); + SYNC_BACK_SREG(gs, GS); + +#ifdef TARGET_X86_64 + pCtx->rip = pVM->rem.s.Env.eip; + pCtx->rflags.u64 = pVM->rem.s.Env.eflags; +#else + pCtx->eip = pVM->rem.s.Env.eip; + pCtx->eflags.u32 = pVM->rem.s.Env.eflags; +#endif + + pCtx->cr0 = pVM->rem.s.Env.cr[0]; + pCtx->cr2 = pVM->rem.s.Env.cr[2]; + pCtx->cr3 = pVM->rem.s.Env.cr[3]; +#ifdef VBOX_WITH_RAW_MODE + if (((pVM->rem.s.Env.cr[4] ^ pCtx->cr4) & X86_CR4_VME) && VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_TSS); +#endif + pCtx->cr4 = pVM->rem.s.Env.cr[4]; + + for (i = 0; i < 8; i++) + pCtx->dr[i] = pVM->rem.s.Env.dr[i]; + + pCtx->gdtr.cbGdt = pVM->rem.s.Env.gdt.limit; + if (pCtx->gdtr.pGdt != pVM->rem.s.Env.gdt.base) + { + pCtx->gdtr.pGdt = pVM->rem.s.Env.gdt.base; + STAM_COUNTER_INC(&gStatREMGDTChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_GDT); +#endif + } + + pCtx->idtr.cbIdt = pVM->rem.s.Env.idt.limit; + if (pCtx->idtr.pIdt != pVM->rem.s.Env.idt.base) + { + pCtx->idtr.pIdt = pVM->rem.s.Env.idt.base; + STAM_COUNTER_INC(&gStatREMIDTChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_TRPM_SYNC_IDT); +#endif + } + + if ( pCtx->ldtr.Sel != pVM->rem.s.Env.ldt.selector + || pCtx->ldtr.ValidSel != pVM->rem.s.Env.ldt.selector + || pCtx->ldtr.u64Base != pVM->rem.s.Env.ldt.base + || pCtx->ldtr.u32Limit != pVM->rem.s.Env.ldt.limit + || pCtx->ldtr.Attr.u != ((pVM->rem.s.Env.ldt.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK) + || !(pCtx->ldtr.fFlags & CPUMSELREG_FLAGS_VALID) + ) + { + pCtx->ldtr.Sel = pVM->rem.s.Env.ldt.selector; + 
pCtx->ldtr.ValidSel = pVM->rem.s.Env.ldt.selector; + pCtx->ldtr.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->ldtr.u64Base = pVM->rem.s.Env.ldt.base; + pCtx->ldtr.u32Limit = pVM->rem.s.Env.ldt.limit; + pCtx->ldtr.Attr.u = (pVM->rem.s.Env.ldt.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + STAM_COUNTER_INC(&gStatREMLDTRChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_LDT); +#endif + } + + if ( pCtx->tr.Sel != pVM->rem.s.Env.tr.selector + || pCtx->tr.ValidSel != pVM->rem.s.Env.tr.selector + || pCtx->tr.u64Base != pVM->rem.s.Env.tr.base + || pCtx->tr.u32Limit != pVM->rem.s.Env.tr.limit + || pCtx->tr.Attr.u != ((pVM->rem.s.Env.tr.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK) + || !(pCtx->tr.fFlags & CPUMSELREG_FLAGS_VALID) + ) + { + Log(("REM: TR changed! %#x{%#llx,%#x,%#x} -> %#x{%llx,%#x,%#x}\n", + pCtx->tr.Sel, pCtx->tr.u64Base, pCtx->tr.u32Limit, pCtx->tr.Attr.u, + pVM->rem.s.Env.tr.selector, (uint64_t)pVM->rem.s.Env.tr.base, pVM->rem.s.Env.tr.limit, + pVM->rem.s.Env.tr.flags >> SEL_FLAGS_SHIFT)); + pCtx->tr.Sel = pVM->rem.s.Env.tr.selector; + pCtx->tr.ValidSel = pVM->rem.s.Env.tr.selector; + pCtx->tr.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->tr.u64Base = pVM->rem.s.Env.tr.base; + pCtx->tr.u32Limit = pVM->rem.s.Env.tr.limit; + pCtx->tr.Attr.u = (pVM->rem.s.Env.tr.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + Assert(pCtx->tr.Attr.u & ~DESC_INTEL_UNUSABLE); + STAM_COUNTER_INC(&gStatREMTRChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_TSS); +#endif + } + + /* Sysenter MSR */ + pCtx->SysEnter.cs = pVM->rem.s.Env.sysenter_cs; + pCtx->SysEnter.eip = pVM->rem.s.Env.sysenter_eip; + pCtx->SysEnter.esp = pVM->rem.s.Env.sysenter_esp; + + /* System MSRs. */ + pCtx->msrEFER = pVM->rem.s.Env.efer; + pCtx->msrSTAR = pVM->rem.s.Env.star; + pCtx->msrPAT = pVM->rem.s.Env.pat; +#ifdef TARGET_X86_64 + pCtx->msrLSTAR = pVM->rem.s.Env.lstar; + pCtx->msrCSTAR = pVM->rem.s.Env.cstar; + pCtx->msrSFMASK = pVM->rem.s.Env.fmask; + pCtx->msrKERNELGSBASE = pVM->rem.s.Env.kernelgsbase; +#endif + + /* Inhibit interrupt flag. */ + if (pVM->rem.s.Env.hflags & HF_INHIBIT_IRQ_MASK) + { + Log(("Settings VMCPU_FF_INHIBIT_INTERRUPTS at %RGv (REM)\n", (RTGCPTR)pCtx->rip)); + EMSetInhibitInterruptsPC(pVCpu, pCtx->rip); + VMCPU_FF_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + } + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + { + Log(("Clearing VMCPU_FF_INHIBIT_INTERRUPTS at %RGv - successor %RGv (REM#2)\n", (RTGCPTR)pCtx->rip, EMGetInhibitInterruptsPC(pVCpu))); + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + } + + /* Inhibit NMI flag. */ + if (pVM->rem.s.Env.hflags2 & HF2_NMI_MASK) + { + Log(("Settings VMCPU_FF_BLOCK_NMIS at %RGv (REM)\n", (RTGCPTR)pCtx->rip)); + VMCPU_FF_SET(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + { + Log(("Clearing VMCPU_FF_BLOCK_NMIS at %RGv (REM)\n", (RTGCPTR)pCtx->rip)); + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + + remR3TrapClear(pVM); + + /* + * Check for traps. + */ + if ( pVM->rem.s.Env.exception_index >= 0 + && pVM->rem.s.Env.exception_index < 256) + { + /* This cannot be a hardware-interrupt because exception_index < EXCP_INTERRUPT. */ + int rc; + + Log(("REMR3StateBack: Pending trap %x %d\n", pVM->rem.s.Env.exception_index, pVM->rem.s.Env.exception_is_int)); + TRPMEVENT enmType = pVM->rem.s.Env.exception_is_int == 0 ? TRPM_TRAP + : pVM->rem.s.Env.exception_is_int == EXCEPTION_IS_INT_VALUE_HARDWARE_IRQ ? 
TRPM_HARDWARE_INT + : TRPM_SOFTWARE_INT; + rc = TRPMAssertTrap(pVCpu, pVM->rem.s.Env.exception_index, enmType); + AssertRC(rc); + if (enmType == TRPM_TRAP) + { + switch (pVM->rem.s.Env.exception_index) + { + case X86_XCPT_PF: + TRPMSetFaultAddress(pVCpu, pCtx->cr2); + /* fallthru */ + case X86_XCPT_TS: case X86_XCPT_NP: case X86_XCPT_SS: case X86_XCPT_GP: + case X86_XCPT_AC: case X86_XCPT_DF: /* 0 */ + TRPMSetErrorCode(pVCpu, pVM->rem.s.Env.error_code); + break; + } + } + } + + /* + * We're not longer in REM mode. + */ + CPUMR3RemLeave(pVCpu, + !VM_IS_RAW_MODE_ENABLED(pVM) + || ( pVM->rem.s.Env.segs[R_SS].newselector + | pVM->rem.s.Env.segs[R_GS].newselector + | pVM->rem.s.Env.segs[R_FS].newselector + | pVM->rem.s.Env.segs[R_ES].newselector + | pVM->rem.s.Env.segs[R_DS].newselector + | pVM->rem.s.Env.segs[R_CS].newselector) == 0 + ); + VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED, VMCPUSTATE_STARTED_EXEC_REM); + pVM->rem.s.fInREM = false; + pVM->rem.s.pCtx = NULL; + pVM->rem.s.Env.pVCpu = NULL; + STAM_PROFILE_STOP(&pVM->rem.s.StatsStateBack, a); + Log2(("REMR3StateBack: returns VINF_SUCCESS\n")); + return VINF_SUCCESS; +} + + +/** + * This is called by the disassembler when it wants to update the cpu state + * before for instance doing a register dump. + */ +static void remR3StateUpdate(PVM pVM, PVMCPU pVCpu) +{ + register PCPUMCTX pCtx = pVM->rem.s.pCtx; + unsigned i; + + Assert(pVM->rem.s.fInREM); + + /* + * Copy back the registers. + * This is done in the order they are declared in the CPUMCTX structure. + */ + + PX86FXSTATE pFpuCtx = &pCtx->pXStateR3->x87; + /** @todo FOP */ + /** @todo FPUIP */ + /** @todo CS */ + /** @todo FPUDP */ + /** @todo DS */ + /** @todo Fix MXCSR support in QEMU so we don't overwrite MXCSR with 0 when we shouldn't! */ + pFpuCtx->MXCSR = 0; + pFpuCtx->MXCSR_MASK = 0; + + /** @todo check if FPU/XMM was actually used in the recompiler */ + restore_raw_fp_state(&pVM->rem.s.Env, (uint8_t *)pFpuCtx); +//// dprintf2(("FPU state CW=%04X TT=%04X SW=%04X (%04X)\n", env->fpuc, env->fpstt, env->fpus, pVMCtx->fpu.FSW)); + +#ifdef TARGET_X86_64 + pCtx->rdi = pVM->rem.s.Env.regs[R_EDI]; + pCtx->rsi = pVM->rem.s.Env.regs[R_ESI]; + pCtx->rbp = pVM->rem.s.Env.regs[R_EBP]; + pCtx->rax = pVM->rem.s.Env.regs[R_EAX]; + pCtx->rbx = pVM->rem.s.Env.regs[R_EBX]; + pCtx->rdx = pVM->rem.s.Env.regs[R_EDX]; + pCtx->rcx = pVM->rem.s.Env.regs[R_ECX]; + pCtx->r8 = pVM->rem.s.Env.regs[8]; + pCtx->r9 = pVM->rem.s.Env.regs[9]; + pCtx->r10 = pVM->rem.s.Env.regs[10]; + pCtx->r11 = pVM->rem.s.Env.regs[11]; + pCtx->r12 = pVM->rem.s.Env.regs[12]; + pCtx->r13 = pVM->rem.s.Env.regs[13]; + pCtx->r14 = pVM->rem.s.Env.regs[14]; + pCtx->r15 = pVM->rem.s.Env.regs[15]; + + pCtx->rsp = pVM->rem.s.Env.regs[R_ESP]; +#else + pCtx->edi = pVM->rem.s.Env.regs[R_EDI]; + pCtx->esi = pVM->rem.s.Env.regs[R_ESI]; + pCtx->ebp = pVM->rem.s.Env.regs[R_EBP]; + pCtx->eax = pVM->rem.s.Env.regs[R_EAX]; + pCtx->ebx = pVM->rem.s.Env.regs[R_EBX]; + pCtx->edx = pVM->rem.s.Env.regs[R_EDX]; + pCtx->ecx = pVM->rem.s.Env.regs[R_ECX]; + + pCtx->esp = pVM->rem.s.Env.regs[R_ESP]; +#endif + + SYNC_BACK_SREG(es, ES); + SYNC_BACK_SREG(cs, CS); + SYNC_BACK_SREG(ss, SS); + SYNC_BACK_SREG(ds, DS); + SYNC_BACK_SREG(fs, FS); + SYNC_BACK_SREG(gs, GS); + +#ifdef TARGET_X86_64 + pCtx->rip = pVM->rem.s.Env.eip; + pCtx->rflags.u64 = pVM->rem.s.Env.eflags; +#else + pCtx->eip = pVM->rem.s.Env.eip; + pCtx->eflags.u32 = pVM->rem.s.Env.eflags; +#endif + + pCtx->cr0 = pVM->rem.s.Env.cr[0]; + pCtx->cr2 = pVM->rem.s.Env.cr[2]; + pCtx->cr3 = 
pVM->rem.s.Env.cr[3]; +#ifdef VBOX_WITH_RAW_MODE + if (((pVM->rem.s.Env.cr[4] ^ pCtx->cr4) & X86_CR4_VME) && VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_TSS); +#endif + pCtx->cr4 = pVM->rem.s.Env.cr[4]; + + for (i = 0; i < 8; i++) + pCtx->dr[i] = pVM->rem.s.Env.dr[i]; + + pCtx->gdtr.cbGdt = pVM->rem.s.Env.gdt.limit; + if (pCtx->gdtr.pGdt != (RTGCPTR)pVM->rem.s.Env.gdt.base) + { + pCtx->gdtr.pGdt = (RTGCPTR)pVM->rem.s.Env.gdt.base; + STAM_COUNTER_INC(&gStatREMGDTChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_GDT); +#endif + } + + pCtx->idtr.cbIdt = pVM->rem.s.Env.idt.limit; + if (pCtx->idtr.pIdt != (RTGCPTR)pVM->rem.s.Env.idt.base) + { + pCtx->idtr.pIdt = (RTGCPTR)pVM->rem.s.Env.idt.base; + STAM_COUNTER_INC(&gStatREMIDTChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_TRPM_SYNC_IDT); +#endif + } + + if ( pCtx->ldtr.Sel != pVM->rem.s.Env.ldt.selector + || pCtx->ldtr.ValidSel != pVM->rem.s.Env.ldt.selector + || pCtx->ldtr.u64Base != pVM->rem.s.Env.ldt.base + || pCtx->ldtr.u32Limit != pVM->rem.s.Env.ldt.limit + || pCtx->ldtr.Attr.u != ((pVM->rem.s.Env.ldt.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK) + || !(pCtx->ldtr.fFlags & CPUMSELREG_FLAGS_VALID) + ) + { + pCtx->ldtr.Sel = pVM->rem.s.Env.ldt.selector; + pCtx->ldtr.ValidSel = pVM->rem.s.Env.ldt.selector; + pCtx->ldtr.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->ldtr.u64Base = pVM->rem.s.Env.ldt.base; + pCtx->ldtr.u32Limit = pVM->rem.s.Env.ldt.limit; + pCtx->ldtr.Attr.u = (pVM->rem.s.Env.ldt.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + STAM_COUNTER_INC(&gStatREMLDTRChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_LDT); +#endif + } + + if ( pCtx->tr.Sel != pVM->rem.s.Env.tr.selector + || pCtx->tr.ValidSel != pVM->rem.s.Env.tr.selector + || pCtx->tr.u64Base != pVM->rem.s.Env.tr.base + || pCtx->tr.u32Limit != pVM->rem.s.Env.tr.limit + || pCtx->tr.Attr.u != ((pVM->rem.s.Env.tr.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK) + || !(pCtx->tr.fFlags & CPUMSELREG_FLAGS_VALID) + ) + { + Log(("REM: TR changed! %#x{%#llx,%#x,%#x} -> %#x{%llx,%#x,%#x}\n", + pCtx->tr.Sel, pCtx->tr.u64Base, pCtx->tr.u32Limit, pCtx->tr.Attr.u, + pVM->rem.s.Env.tr.selector, (uint64_t)pVM->rem.s.Env.tr.base, pVM->rem.s.Env.tr.limit, + pVM->rem.s.Env.tr.flags >> SEL_FLAGS_SHIFT)); + pCtx->tr.Sel = pVM->rem.s.Env.tr.selector; + pCtx->tr.ValidSel = pVM->rem.s.Env.tr.selector; + pCtx->tr.fFlags = CPUMSELREG_FLAGS_VALID; + pCtx->tr.u64Base = pVM->rem.s.Env.tr.base; + pCtx->tr.u32Limit = pVM->rem.s.Env.tr.limit; + pCtx->tr.Attr.u = (pVM->rem.s.Env.tr.flags >> SEL_FLAGS_SHIFT) & SEL_FLAGS_SMASK; + Assert(pCtx->tr.Attr.u & ~DESC_INTEL_UNUSABLE); + STAM_COUNTER_INC(&gStatREMTRChange); +#ifdef VBOX_WITH_RAW_MODE + if (VM_IS_RAW_MODE_ENABLED(pVM)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_SELM_SYNC_TSS); +#endif + } + + /* Sysenter MSR */ + pCtx->SysEnter.cs = pVM->rem.s.Env.sysenter_cs; + pCtx->SysEnter.eip = pVM->rem.s.Env.sysenter_eip; + pCtx->SysEnter.esp = pVM->rem.s.Env.sysenter_esp; + + /* System MSRs. 
*/ + pCtx->msrEFER = pVM->rem.s.Env.efer; + pCtx->msrSTAR = pVM->rem.s.Env.star; + pCtx->msrPAT = pVM->rem.s.Env.pat; +#ifdef TARGET_X86_64 + pCtx->msrLSTAR = pVM->rem.s.Env.lstar; + pCtx->msrCSTAR = pVM->rem.s.Env.cstar; + pCtx->msrSFMASK = pVM->rem.s.Env.fmask; + pCtx->msrKERNELGSBASE = pVM->rem.s.Env.kernelgsbase; +#endif + +} + + +/** + * Update the VMM state information if we're currently in REM. + * + * This method is used by the DBGF and PDMDevice when there is any uncertainty of whether + * we're currently executing in REM and the VMM state is invalid. This method will of + * course check that we're executing in REM before syncing any data over to the VMM. + * + * @param pVM The VM handle. + * @param pVCpu The VMCPU handle. + */ +REMR3DECL(void) REMR3StateUpdate(PVM pVM, PVMCPU pVCpu) +{ + if (pVM->rem.s.fInREM) + remR3StateUpdate(pVM, pVCpu); +} + + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM + + +/** + * Notify the recompiler about Address Gate 20 state change. + * + * This notification is required since A20 gate changes are + * initialized from a device driver and the VM might just as + * well be in REM mode as in RAW mode. + * + * @param pVM VM handle. + * @param pVCpu VMCPU handle. + * @param fEnable True if the gate should be enabled. + * False if the gate should be disabled. + */ +REMR3DECL(void) REMR3A20Set(PVM pVM, PVMCPU pVCpu, bool fEnable) +{ + LogFlow(("REMR3A20Set: fEnable=%d\n", fEnable)); + VM_ASSERT_EMT(pVM); + + /** @todo SMP and the A20 gate... */ + if (pVM->rem.s.Env.pVCpu == pVCpu) + { + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + cpu_x86_set_a20(&pVM->rem.s.Env, fEnable); + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); + } +} + + +/** + * Replays the handler notification changes + * Called in response to VM_FF_REM_HANDLER_NOTIFY from the RAW execution loop. + * + * @param pVM VM handle. + */ +REMR3DECL(void) REMR3ReplayHandlerNotifications(PVM pVM) +{ + /* + * Replay the flushes. + */ + LogFlow(("REMR3ReplayHandlerNotifications:\n")); + VM_ASSERT_EMT(pVM); + + /** @todo this isn't ensuring correct replay order. */ + if (VM_FF_TEST_AND_CLEAR(pVM, VM_FF_REM_HANDLER_NOTIFY)) + { + uint32_t idxNext; + uint32_t idxRevHead; + uint32_t idxHead; +#ifdef VBOX_STRICT + int32_t c = 0; +#endif + + /* Lockless purging of pending notifications. */ + idxHead = ASMAtomicXchgU32(&pVM->rem.s.idxPendingList, UINT32_MAX); + if (idxHead == UINT32_MAX) + return; + Assert(idxHead < RT_ELEMENTS(pVM->rem.s.aHandlerNotifications)); + + /* + * Reverse the list to process it in FIFO order. + */ + idxRevHead = UINT32_MAX; + do + { + /* Save the index of the next rec. */ + idxNext = pVM->rem.s.aHandlerNotifications[idxHead].idxNext; + Assert(idxNext < RT_ELEMENTS(pVM->rem.s.aHandlerNotifications) || idxNext == UINT32_MAX); + /* Push the record onto the reversed list. */ + pVM->rem.s.aHandlerNotifications[idxHead].idxNext = idxRevHead; + idxRevHead = idxHead; + Assert(++c <= RT_ELEMENTS(pVM->rem.s.aHandlerNotifications)); + /* Advance. */ + idxHead = idxNext; + } while (idxHead != UINT32_MAX); + + /* + * Loop thru the list, reinserting the record into the free list as they are + * processed to avoid having other EMTs running out of entries while we're flushing. 
+ */ + idxHead = idxRevHead; + do + { + PREMHANDLERNOTIFICATION pCur = &pVM->rem.s.aHandlerNotifications[idxHead]; + uint32_t idxCur; + Assert(--c >= 0); + + switch (pCur->enmKind) + { + case REMHANDLERNOTIFICATIONKIND_PHYSICAL_REGISTER: + remR3NotifyHandlerPhysicalRegister(pVM, + pCur->u.PhysicalRegister.enmKind, + pCur->u.PhysicalRegister.GCPhys, + pCur->u.PhysicalRegister.cb, + pCur->u.PhysicalRegister.fHasHCHandler); + break; + + case REMHANDLERNOTIFICATIONKIND_PHYSICAL_DEREGISTER: + remR3NotifyHandlerPhysicalDeregister(pVM, + pCur->u.PhysicalDeregister.enmKind, + pCur->u.PhysicalDeregister.GCPhys, + pCur->u.PhysicalDeregister.cb, + pCur->u.PhysicalDeregister.fHasHCHandler, + pCur->u.PhysicalDeregister.fRestoreAsRAM); + break; + + case REMHANDLERNOTIFICATIONKIND_PHYSICAL_MODIFY: + remR3NotifyHandlerPhysicalModify(pVM, + pCur->u.PhysicalModify.enmKind, + pCur->u.PhysicalModify.GCPhysOld, + pCur->u.PhysicalModify.GCPhysNew, + pCur->u.PhysicalModify.cb, + pCur->u.PhysicalModify.fHasHCHandler, + pCur->u.PhysicalModify.fRestoreAsRAM); + break; + + default: + AssertReleaseMsgFailed(("enmKind=%d\n", pCur->enmKind)); + break; + } + + /* + * Advance idxHead. + */ + idxCur = idxHead; + idxHead = pCur->idxNext; + Assert(idxHead < RT_ELEMENTS(pVM->rem.s.aHandlerNotifications) || (idxHead == UINT32_MAX && c == 0)); + + /* + * Put the record back into the free list. + */ + do + { + idxNext = ASMAtomicUoReadU32(&pVM->rem.s.idxFreeList); + ASMAtomicWriteU32(&pCur->idxNext, idxNext); + ASMCompilerBarrier(); + } while (!ASMAtomicCmpXchgU32(&pVM->rem.s.idxFreeList, idxCur, idxNext)); + } while (idxHead != UINT32_MAX); + +#ifdef VBOX_STRICT + if (pVM->cCpus == 1) + { + unsigned c; + /* Check that all records are now on the free list. */ + for (c = 0, idxNext = pVM->rem.s.idxFreeList; idxNext != UINT32_MAX; + idxNext = pVM->rem.s.aHandlerNotifications[idxNext].idxNext) + c++; + AssertReleaseMsg(c == RT_ELEMENTS(pVM->rem.s.aHandlerNotifications), ("%#x != %#x, idxFreeList=%#x\n", c, RT_ELEMENTS(pVM->rem.s.aHandlerNotifications), pVM->rem.s.idxFreeList)); + } +#endif + } +} + + +/** + * Notify REM about changed code page. + * + * @returns VBox status code. + * @param pVM VM handle. + * @param pVCpu VMCPU handle. + * @param pvCodePage Code page address + */ +REMR3DECL(int) REMR3NotifyCodePageChanged(PVM pVM, PVMCPU pVCpu, RTGCPTR pvCodePage) +{ +#ifdef VBOX_REM_PROTECT_PAGES_FROM_SMC + int rc; + RTGCPHYS PhysGC; + uint64_t flags; + + VM_ASSERT_EMT(pVM); + + /* + * Get the physical page address. + */ + rc = PGMGstGetPage(pVM, pvCodePage, &flags, &PhysGC); + if (rc == VINF_SUCCESS) + { + /* + * Sync the required registers and flush the whole page. + * (Easier to do the whole page than notifying it about each physical + * byte that was changed. + */ + pVM->rem.s.Env.cr[0] = pVM->rem.s.pCtx->cr0; + pVM->rem.s.Env.cr[2] = pVM->rem.s.pCtx->cr2; + pVM->rem.s.Env.cr[3] = pVM->rem.s.pCtx->cr3; + pVM->rem.s.Env.cr[4] = pVM->rem.s.pCtx->cr4; + + tb_invalidate_phys_page_range(PhysGC, PhysGC + PAGE_SIZE - 1, 0); + } +#endif + return VINF_SUCCESS; +} + + +/** + * Notification about a successful MMR3PhysRegister() call. + * + * @param pVM VM handle. + * @param GCPhys The physical address the RAM. + * @param cb Size of the memory. + * @param fFlags Flags of the REM_NOTIFY_PHYS_RAM_FLAGS_* defines. 
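+ *
+ * @remark  A minimal usage sketch, assuming a hypothetical caller that registers
+ *          a page aligned 128 MB base RAM range (the address and size are
+ *          placeholders picked for illustration, not taken from this file):
+ * @code
+ *          REMR3NotifyPhysRamRegister(pVM, GCPhysStart, 128 * _1M,
+ *                                     REM_NOTIFY_PHYS_RAM_FLAGS_RAM);
+ * @endcode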
+ */ +REMR3DECL(void) REMR3NotifyPhysRamRegister(PVM pVM, RTGCPHYS GCPhys, RTGCPHYS cb, unsigned fFlags) +{ + Log(("REMR3NotifyPhysRamRegister: GCPhys=%RGp cb=%RGp fFlags=%#x\n", GCPhys, cb, fFlags)); + VM_ASSERT_EMT(pVM); + + /* + * Validate input - we trust the caller. + */ + Assert(RT_ALIGN_T(GCPhys, PAGE_SIZE, RTGCPHYS) == GCPhys); + Assert(cb); + Assert(RT_ALIGN_Z(cb, PAGE_SIZE) == cb); + AssertMsg(fFlags == REM_NOTIFY_PHYS_RAM_FLAGS_RAM || fFlags == REM_NOTIFY_PHYS_RAM_FLAGS_MMIO2, ("%#x\n", fFlags)); + + /* + * Base ram? Update GCPhysLastRam. + */ + if (fFlags & REM_NOTIFY_PHYS_RAM_FLAGS_RAM) + { + if (GCPhys + (cb - 1) > pVM->rem.s.GCPhysLastRam) + { + AssertReleaseMsg(!pVM->rem.s.fGCPhysLastRamFixed, ("GCPhys=%RGp cb=%RGp\n", GCPhys, cb)); + pVM->rem.s.GCPhysLastRam = GCPhys + (cb - 1); + } + } + + /* + * Register the ram. + */ + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + PDMCritSectEnter(&pVM->rem.s.CritSectRegister, VERR_SEM_BUSY); + cpu_register_physical_memory_offset(GCPhys, cb, GCPhys, GCPhys); + PDMCritSectLeave(&pVM->rem.s.CritSectRegister); + + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); +} + + +/** + * Notification about a successful MMR3PhysRomRegister() call. + * + * @param pVM VM handle. + * @param GCPhys The physical address of the ROM. + * @param cb The size of the ROM. + * @param pvCopy Pointer to the ROM copy. + * @param fShadow Whether it's currently writable shadow ROM or normal readonly ROM. + * This function will be called when ever the protection of the + * shadow ROM changes (at reset and end of POST). + */ +REMR3DECL(void) REMR3NotifyPhysRomRegister(PVM pVM, RTGCPHYS GCPhys, RTUINT cb, void *pvCopy, bool fShadow) +{ + Log(("REMR3NotifyPhysRomRegister: GCPhys=%RGp cb=%d fShadow=%RTbool\n", GCPhys, cb, fShadow)); + VM_ASSERT_EMT(pVM); + + /* + * Validate input - we trust the caller. + */ + Assert(RT_ALIGN_T(GCPhys, PAGE_SIZE, RTGCPHYS) == GCPhys); + Assert(cb); + Assert(RT_ALIGN_Z(cb, PAGE_SIZE) == cb); + + /* + * Register the rom. + */ + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + PDMCritSectEnter(&pVM->rem.s.CritSectRegister, VERR_SEM_BUSY); + cpu_register_physical_memory_offset(GCPhys, cb, GCPhys | (fShadow ? 0 : IO_MEM_ROM), GCPhys); + PDMCritSectLeave(&pVM->rem.s.CritSectRegister); + + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); +} + + +/** + * Notification about a successful memory deregistration or reservation. + * + * @param pVM VM Handle. + * @param GCPhys Start physical address. + * @param cb The size of the range. + */ +REMR3DECL(void) REMR3NotifyPhysRamDeregister(PVM pVM, RTGCPHYS GCPhys, RTUINT cb) +{ + Log(("REMR3NotifyPhysRamDeregister: GCPhys=%RGp cb=%d\n", GCPhys, cb)); + VM_ASSERT_EMT(pVM); + + /* + * Validate input - we trust the caller. + */ + Assert(RT_ALIGN_T(GCPhys, PAGE_SIZE, RTGCPHYS) == GCPhys); + Assert(cb); + Assert(RT_ALIGN_Z(cb, PAGE_SIZE) == cb); + + /* + * Unassigning the memory. + */ + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + PDMCritSectEnter(&pVM->rem.s.CritSectRegister, VERR_SEM_BUSY); + cpu_register_physical_memory_offset(GCPhys, cb, IO_MEM_UNASSIGNED, GCPhys); + PDMCritSectLeave(&pVM->rem.s.CritSectRegister); + + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); +} + + +/** + * Notification about a successful PGMR3HandlerPhysicalRegister() call. + * + * @param pVM VM Handle. + * @param enmKind Kind of access handler. + * @param GCPhys Handler range address. + * @param cb Size of the handler range. + * @param fHasHCHandler Set if the handler has a HC callback function. 
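+ *
+ * @remark  Rough sketch of the effect: an MMIO range is remapped to the special
+ *          iMMIOMemType so accesses from recompiled code are dispatched through
+ *          the remR3MMIORead / remR3MMIOWrite helpers further down, while a
+ *          range with a HC handler gets iHandlerMemType and is routed through
+ *          the remR3HandlerRead / remR3HandlerWrite helpers instead.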
+ * + * @remark MMR3PhysRomRegister assumes that this function will not apply the + * Handler memory type to memory which has no HC handler. + */ +static void remR3NotifyHandlerPhysicalRegister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, + bool fHasHCHandler) +{ + Log(("REMR3NotifyHandlerPhysicalRegister: enmKind=%d GCPhys=%RGp cb=%RGp fHasHCHandler=%d\n", + enmKind, GCPhys, cb, fHasHCHandler)); + + VM_ASSERT_EMT(pVM); + Assert(RT_ALIGN_T(GCPhys, PAGE_SIZE, RTGCPHYS) == GCPhys); + Assert(RT_ALIGN_T(cb, PAGE_SIZE, RTGCPHYS) == cb); + + + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + PDMCritSectEnter(&pVM->rem.s.CritSectRegister, VERR_SEM_BUSY); + if (enmKind == PGMPHYSHANDLERKIND_MMIO) + cpu_register_physical_memory_offset(GCPhys, cb, pVM->rem.s.iMMIOMemType, GCPhys); + else if (fHasHCHandler) + cpu_register_physical_memory_offset(GCPhys, cb, pVM->rem.s.iHandlerMemType, GCPhys); + PDMCritSectLeave(&pVM->rem.s.CritSectRegister); + + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); +} + +/** + * Notification about a successful PGMR3HandlerPhysicalRegister() call. + * + * @param pVM VM Handle. + * @param enmKind Kind of access handler. + * @param GCPhys Handler range address. + * @param cb Size of the handler range. + * @param fHasHCHandler Set if the handler has a HC callback function. + * + * @remark MMR3PhysRomRegister assumes that this function will not apply the + * Handler memory type to memory which has no HC handler. + */ +REMR3DECL(void) REMR3NotifyHandlerPhysicalRegister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, + bool fHasHCHandler) +{ + REMR3ReplayHandlerNotifications(pVM); + + remR3NotifyHandlerPhysicalRegister(pVM, enmKind, GCPhys, cb, fHasHCHandler); +} + +/** + * Notification about a successful PGMR3HandlerPhysicalDeregister() operation. + * + * @param pVM VM Handle. + * @param enmKind Kind of access handler. + * @param GCPhys Handler range address. + * @param cb Size of the handler range. + * @param fHasHCHandler Set if the handler has a HC callback function. + * @param fRestoreAsRAM Whether the to restore it as normal RAM or as unassigned memory. + */ +static void remR3NotifyHandlerPhysicalDeregister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, + bool fHasHCHandler, bool fRestoreAsRAM) +{ + Log(("REMR3NotifyHandlerPhysicalDeregister: enmKind=%d GCPhys=%RGp cb=%RGp fHasHCHandler=%RTbool fRestoreAsRAM=%RTbool RAM=%08x\n", + enmKind, GCPhys, cb, fHasHCHandler, fRestoreAsRAM, MMR3PhysGetRamSize(pVM))); + VM_ASSERT_EMT(pVM); + + + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + PDMCritSectEnter(&pVM->rem.s.CritSectRegister, VERR_SEM_BUSY); + /** @todo this isn't right, MMIO can (in theory) be restored as RAM. */ + if (enmKind == PGMPHYSHANDLERKIND_MMIO) + cpu_register_physical_memory_offset(GCPhys, cb, IO_MEM_UNASSIGNED, GCPhys); + else if (fHasHCHandler) + { + if (!fRestoreAsRAM) + { + Assert(GCPhys > MMR3PhysGetRamSize(pVM)); + cpu_register_physical_memory_offset(GCPhys, cb, IO_MEM_UNASSIGNED, GCPhys); + } + else + { + Assert(RT_ALIGN_T(GCPhys, PAGE_SIZE, RTGCPHYS) == GCPhys); + Assert(RT_ALIGN_T(cb, PAGE_SIZE, RTGCPHYS) == cb); + cpu_register_physical_memory_offset(GCPhys, cb, GCPhys, GCPhys); + } + } + PDMCritSectLeave(&pVM->rem.s.CritSectRegister); + + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); +} + +/** + * Notification about a successful PGMR3HandlerPhysicalDeregister() operation. + * + * @param pVM VM Handle. + * @param enmKind Kind of access handler. + * @param GCPhys Handler range address. 
+ * @param cb Size of the handler range. + * @param fHasHCHandler Set if the handler has a HC callback function. + * @param fRestoreAsRAM Whether the to restore it as normal RAM or as unassigned memory. + */ +REMR3DECL(void) REMR3NotifyHandlerPhysicalDeregister(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhys, RTGCPHYS cb, bool fHasHCHandler, bool fRestoreAsRAM) +{ + REMR3ReplayHandlerNotifications(pVM); + remR3NotifyHandlerPhysicalDeregister(pVM, enmKind, GCPhys, cb, fHasHCHandler, fRestoreAsRAM); +} + + +/** + * Notification about a successful PGMR3HandlerPhysicalModify() call. + * + * @param pVM VM Handle. + * @param enmKind Kind of access handler. + * @param GCPhysOld Old handler range address. + * @param GCPhysNew New handler range address. + * @param cb Size of the handler range. + * @param fHasHCHandler Set if the handler has a HC callback function. + * @param fRestoreAsRAM Whether the to restore it as normal RAM or as unassigned memory. + */ +static void remR3NotifyHandlerPhysicalModify(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhysOld, RTGCPHYS GCPhysNew, RTGCPHYS cb, bool fHasHCHandler, bool fRestoreAsRAM) +{ + Log(("REMR3NotifyHandlerPhysicalModify: enmKind=%d GCPhysOld=%RGp GCPhysNew=%RGp cb=%RGp fHasHCHandler=%RTbool fRestoreAsRAM=%RTbool\n", + enmKind, GCPhysOld, GCPhysNew, cb, fHasHCHandler, fRestoreAsRAM)); + VM_ASSERT_EMT(pVM); + AssertReleaseMsg(enmKind != PGMPHYSHANDLERKIND_MMIO, ("enmKind=%d\n", enmKind)); + + if (fHasHCHandler) + { + ASMAtomicIncU32(&pVM->rem.s.cIgnoreAll); + + /* + * Reset the old page. + */ + PDMCritSectEnter(&pVM->rem.s.CritSectRegister, VERR_SEM_BUSY); + if (!fRestoreAsRAM) + cpu_register_physical_memory_offset(GCPhysOld, cb, IO_MEM_UNASSIGNED, GCPhysOld); + else + { + /* This is not perfect, but it'll do for PD monitoring... */ + Assert(cb == PAGE_SIZE); + Assert(RT_ALIGN_T(GCPhysOld, PAGE_SIZE, RTGCPHYS) == GCPhysOld); + cpu_register_physical_memory_offset(GCPhysOld, cb, GCPhysOld, GCPhysOld); + } + + /* + * Update the new page. + */ + Assert(RT_ALIGN_T(GCPhysNew, PAGE_SIZE, RTGCPHYS) == GCPhysNew); + Assert(RT_ALIGN_T(cb, PAGE_SIZE, RTGCPHYS) == cb); + cpu_register_physical_memory_offset(GCPhysNew, cb, pVM->rem.s.iHandlerMemType, GCPhysNew); + PDMCritSectLeave(&pVM->rem.s.CritSectRegister); + + ASMAtomicDecU32(&pVM->rem.s.cIgnoreAll); + } +} + +/** + * Notification about a successful PGMR3HandlerPhysicalModify() call. + * + * @param pVM VM Handle. + * @param enmKind Kind of access handler. + * @param GCPhysOld Old handler range address. + * @param GCPhysNew New handler range address. + * @param cb Size of the handler range. + * @param fHasHCHandler Set if the handler has a HC callback function. + * @param fRestoreAsRAM Whether the to restore it as normal RAM or as unassigned memory. + */ +REMR3DECL(void) REMR3NotifyHandlerPhysicalModify(PVM pVM, PGMPHYSHANDLERKIND enmKind, RTGCPHYS GCPhysOld, RTGCPHYS GCPhysNew, RTGCPHYS cb, bool fHasHCHandler, bool fRestoreAsRAM) +{ + REMR3ReplayHandlerNotifications(pVM); + + remR3NotifyHandlerPhysicalModify(pVM, enmKind, GCPhysOld, GCPhysNew, cb, fHasHCHandler, fRestoreAsRAM); +} + +/** + * Checks if we're handling access to this page or not. + * + * @returns true if we're trapping access. + * @returns false if we aren't. + * @param pVM The VM handle. + * @param GCPhys The physical address. + * + * @remark This function will only work correctly in VBOX_STRICT builds! 
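+ *
+ * @remark  A minimal sketch of the intended use, strict builds only (the page
+ *          variable is a placeholder, not taken from this file):
+ * @code
+ *          Assert(REMR3IsPageAccessHandled(pVM, GCPhysTrackedPage));
+ * @endcode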
+ */ +REMR3DECL(bool) REMR3IsPageAccessHandled(PVM pVM, RTGCPHYS GCPhys) +{ +#ifdef VBOX_STRICT + ram_addr_t off; + REMR3ReplayHandlerNotifications(pVM); + + off = get_phys_page_offset(GCPhys); + return (off & PAGE_OFFSET_MASK) == pVM->rem.s.iHandlerMemType + || (off & PAGE_OFFSET_MASK) == pVM->rem.s.iMMIOMemType + || (off & PAGE_OFFSET_MASK) == IO_MEM_ROM; +#else + return false; +#endif +} + + +/** + * Deals with a rare case in get_phys_addr_code where the code + * is being monitored. + * + * It could also be an MMIO page, in which case we will raise a fatal error. + * + * @returns The physical address corresponding to addr. + * @param env The cpu environment. + * @param addr The virtual address. + * @param pTLBEntry The TLB entry. + * @param IoTlbEntry The I/O TLB entry address. + */ +target_ulong remR3PhysGetPhysicalAddressCode(CPUX86State *env, + target_ulong addr, + CPUTLBEntry *pTLBEntry, + target_phys_addr_t IoTlbEntry) +{ + PVM pVM = env->pVM; + + if ((IoTlbEntry & ~TARGET_PAGE_MASK) == pVM->rem.s.iHandlerMemType) + { + /* If code memory is being monitored, appropriate IOTLB entry will have + handler IO type, and addend will provide real physical address, no + matter if we store VA in TLB or not, as handlers are always passed PA */ + target_ulong ret = (IoTlbEntry & TARGET_PAGE_MASK) + addr; + return ret; + } + LogRel(("\nTrying to execute code with memory type addr_code=%RGv addend=%RGp at %RGv! (iHandlerMemType=%#x iMMIOMemType=%#x IOTLB=%RGp)\n" + "*** handlers\n", + (RTGCPTR)pTLBEntry->addr_code, (RTGCPHYS)pTLBEntry->addend, (RTGCPTR)addr, pVM->rem.s.iHandlerMemType, pVM->rem.s.iMMIOMemType, (RTGCPHYS)IoTlbEntry)); + DBGFR3Info(pVM->pUVM, "handlers", NULL, DBGFR3InfoLogRelHlp()); + LogRel(("*** mmio\n")); + DBGFR3Info(pVM->pUVM, "mmio", NULL, DBGFR3InfoLogRelHlp()); + LogRel(("*** phys\n")); + DBGFR3Info(pVM->pUVM, "phys", NULL, DBGFR3InfoLogRelHlp()); + cpu_abort(env, "Trying to execute code with memory type addr_code=%RGv addend=%RGp at %RGv. (iHandlerMemType=%#x iMMIOMemType=%#x)\n", + (RTGCPTR)pTLBEntry->addr_code, (RTGCPHYS)pTLBEntry->addend, (RTGCPTR)addr, pVM->rem.s.iHandlerMemType, pVM->rem.s.iMMIOMemType); + AssertFatalFailed(); +} + +/** + * Read guest RAM and ROM. + * + * @param SrcGCPhys The source address (guest physical). + * @param pvDst The destination address. + * @param cb Number of bytes + */ +void remR3PhysRead(RTGCPHYS SrcGCPhys, void *pvDst, unsigned cb) +{ + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + VBOXSTRICTRC rcStrict = PGMPhysRead(cpu_single_env->pVM, SrcGCPhys, pvDst, cb, PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); +#ifdef VBOX_DEBUG_PHYS + LogRel(("read(%d): %08x\n", cb, (uint32_t)SrcGCPhys)); +#endif + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +} + + +/** + * Read guest RAM and ROM, unsigned 8-bit. + * + * @param SrcGCPhys The source address (guest physical). + */ +RTCCUINTREG remR3PhysReadU8(RTGCPHYS SrcGCPhys) +{ + uint8_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU8(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("readu8: %x <- %08x\n", val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Read guest RAM and ROM, signed 8-bit. + * + * @param SrcGCPhys The source address (guest physical). 
+ */ +RTCCINTREG remR3PhysReadS8(RTGCPHYS SrcGCPhys) +{ + int8_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU8(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("reads8: %x <- %08x\n", val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Read guest RAM and ROM, unsigned 16-bit. + * + * @param SrcGCPhys The source address (guest physical). + */ +RTCCUINTREG remR3PhysReadU16(RTGCPHYS SrcGCPhys) +{ + uint16_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU16(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("readu16: %x <- %08x\n", val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Read guest RAM and ROM, signed 16-bit. + * + * @param SrcGCPhys The source address (guest physical). + */ +RTCCINTREG remR3PhysReadS16(RTGCPHYS SrcGCPhys) +{ + int16_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU16(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("reads16: %x <- %08x\n", (uint16_t)val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Read guest RAM and ROM, unsigned 32-bit. + * + * @param SrcGCPhys The source address (guest physical). + */ +RTCCUINTREG remR3PhysReadU32(RTGCPHYS SrcGCPhys) +{ + uint32_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU32(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("readu32: %x <- %08x\n", val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Read guest RAM and ROM, signed 32-bit. + * + * @param SrcGCPhys The source address (guest physical). + */ +RTCCINTREG remR3PhysReadS32(RTGCPHYS SrcGCPhys) +{ + int32_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU32(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("reads32: %x <- %08x\n", val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Read guest RAM and ROM, unsigned 64-bit. + * + * @param SrcGCPhys The source address (guest physical). + */ +uint64_t remR3PhysReadU64(RTGCPHYS SrcGCPhys) +{ + uint64_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU64(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("readu64: %llx <- %08x\n", val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Read guest RAM and ROM, signed 64-bit. + * + * @param SrcGCPhys The source address (guest physical). + */ +int64_t remR3PhysReadS64(RTGCPHYS SrcGCPhys) +{ + int64_t val; + STAM_PROFILE_ADV_START(&gStatMemRead, a); + VBOX_CHECK_ADDR(SrcGCPhys); + val = PGMR3PhysReadU64(cpu_single_env->pVM, SrcGCPhys, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemRead, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("reads64: %llx <- %08x\n", val, (uint32_t)SrcGCPhys)); +#endif + return val; +} + + +/** + * Write guest RAM. + * + * @param DstGCPhys The destination address (guest physical). + * @param pvSrc The source address. 
+ * @param cb Number of bytes to write + */ +void remR3PhysWrite(RTGCPHYS DstGCPhys, const void *pvSrc, unsigned cb) +{ + STAM_PROFILE_ADV_START(&gStatMemWrite, a); + VBOX_CHECK_ADDR(DstGCPhys); + VBOXSTRICTRC rcStrict = PGMPhysWrite(cpu_single_env->pVM, DstGCPhys, pvSrc, cb, PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); + STAM_PROFILE_ADV_STOP(&gStatMemWrite, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("write(%d): %08x\n", cb, (uint32_t)DstGCPhys)); +#endif +} + + +/** + * Write guest RAM, unsigned 8-bit. + * + * @param DstGCPhys The destination address (guest physical). + * @param val Value + */ +void remR3PhysWriteU8(RTGCPHYS DstGCPhys, uint8_t val) +{ + STAM_PROFILE_ADV_START(&gStatMemWrite, a); + VBOX_CHECK_ADDR(DstGCPhys); + PGMR3PhysWriteU8(cpu_single_env->pVM, DstGCPhys, val, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemWrite, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("writeu8: %x -> %08x\n", val, (uint32_t)DstGCPhys)); +#endif +} + + +/** + * Write guest RAM, unsigned 8-bit. + * + * @param DstGCPhys The destination address (guest physical). + * @param val Value + */ +void remR3PhysWriteU16(RTGCPHYS DstGCPhys, uint16_t val) +{ + STAM_PROFILE_ADV_START(&gStatMemWrite, a); + VBOX_CHECK_ADDR(DstGCPhys); + PGMR3PhysWriteU16(cpu_single_env->pVM, DstGCPhys, val, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemWrite, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("writeu16: %x -> %08x\n", val, (uint32_t)DstGCPhys)); +#endif +} + + +/** + * Write guest RAM, unsigned 32-bit. + * + * @param DstGCPhys The destination address (guest physical). + * @param val Value + */ +void remR3PhysWriteU32(RTGCPHYS DstGCPhys, uint32_t val) +{ + STAM_PROFILE_ADV_START(&gStatMemWrite, a); + VBOX_CHECK_ADDR(DstGCPhys); + PGMR3PhysWriteU32(cpu_single_env->pVM, DstGCPhys, val, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemWrite, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("writeu32: %x -> %08x\n", val, (uint32_t)DstGCPhys)); +#endif +} + + +/** + * Write guest RAM, unsigned 64-bit. + * + * @param DstGCPhys The destination address (guest physical). + * @param val Value + */ +void remR3PhysWriteU64(RTGCPHYS DstGCPhys, uint64_t val) +{ + STAM_PROFILE_ADV_START(&gStatMemWrite, a); + VBOX_CHECK_ADDR(DstGCPhys); + PGMR3PhysWriteU64(cpu_single_env->pVM, DstGCPhys, val, PGMACCESSORIGIN_REM); + STAM_PROFILE_ADV_STOP(&gStatMemWrite, a); +#ifdef VBOX_DEBUG_PHYS + LogRel(("writeu64: %llx -> %08x\n", val, (uint32_t)DstGCPhys)); +#endif +} + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM_MMIO + +/** Read MMIO memory. */ +static uint32_t remR3MMIOReadU8(void *pvEnv, target_phys_addr_t GCPhys) +{ + CPUX86State *env = (CPUX86State *)pvEnv; + uint32_t u32 = 0; + int rc = IOMMMIORead(env->pVM, env->pVCpu, GCPhys, &u32, 1); + AssertMsg(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc)); NOREF(rc); + Log2(("remR3MMIOReadU8: GCPhys=%RGp -> %02x\n", (RTGCPHYS)GCPhys, u32)); + return u32; +} + +/** Read MMIO memory. */ +static uint32_t remR3MMIOReadU16(void *pvEnv, target_phys_addr_t GCPhys) +{ + CPUX86State *env = (CPUX86State *)pvEnv; + uint32_t u32 = 0; + int rc = IOMMMIORead(env->pVM, env->pVCpu, GCPhys, &u32, 2); + AssertMsg(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc)); NOREF(rc); + Log2(("remR3MMIOReadU16: GCPhys=%RGp -> %04x\n", (RTGCPHYS)GCPhys, u32)); + return u32; +} + +/** Read MMIO memory. 
*/ +static uint32_t remR3MMIOReadU32(void *pvEnv, target_phys_addr_t GCPhys) +{ + CPUX86State *env = (CPUX86State *)pvEnv; + uint32_t u32 = 0; + int rc = IOMMMIORead(env->pVM, env->pVCpu, GCPhys, &u32, 4); + AssertMsg(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc)); NOREF(rc); + Log2(("remR3MMIOReadU32: GCPhys=%RGp -> %08x\n", (RTGCPHYS)GCPhys, u32)); + return u32; +} + +/** Write to MMIO memory. */ +static void remR3MMIOWriteU8(void *pvEnv, target_phys_addr_t GCPhys, uint32_t u32) +{ + CPUX86State *env = (CPUX86State *)pvEnv; + int rc; + Log2(("remR3MMIOWriteU8: GCPhys=%RGp u32=%#x\n", (RTGCPHYS)GCPhys, u32)); + rc = IOMMMIOWrite(env->pVM, env->pVCpu, GCPhys, u32, 1); + AssertMsg(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc)); NOREF(rc); +} + +/** Write to MMIO memory. */ +static void remR3MMIOWriteU16(void *pvEnv, target_phys_addr_t GCPhys, uint32_t u32) +{ + CPUX86State *env = (CPUX86State *)pvEnv; + int rc; + Log2(("remR3MMIOWriteU16: GCPhys=%RGp u32=%#x\n", (RTGCPHYS)GCPhys, u32)); + rc = IOMMMIOWrite(env->pVM, env->pVCpu, GCPhys, u32, 2); + AssertMsg(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc)); NOREF(rc); +} + +/** Write to MMIO memory. */ +static void remR3MMIOWriteU32(void *pvEnv, target_phys_addr_t GCPhys, uint32_t u32) +{ + CPUX86State *env = (CPUX86State *)pvEnv; + int rc; + Log2(("remR3MMIOWriteU32: GCPhys=%RGp u32=%#x\n", (RTGCPHYS)GCPhys, u32)); + rc = IOMMMIOWrite(env->pVM, env->pVCpu, GCPhys, u32, 4); + AssertMsg(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc)); NOREF(rc); +} + + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM_HANDLER + +/* !!!WARNING!!! This is extremely hackish right now, we assume it's only for LFB access! !!!WARNING!!! */ + +static uint32_t remR3HandlerReadU8(void *pvVM, target_phys_addr_t GCPhys) +{ + uint8_t u8; + Log2(("remR3HandlerReadU8: GCPhys=%RGp\n", (RTGCPHYS)GCPhys)); + VBOXSTRICTRC rcStrict = PGMPhysRead((PVM)pvVM, GCPhys, &u8, sizeof(u8), PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); + return u8; +} + +static uint32_t remR3HandlerReadU16(void *pvVM, target_phys_addr_t GCPhys) +{ + uint16_t u16; + Log2(("remR3HandlerReadU16: GCPhys=%RGp\n", (RTGCPHYS)GCPhys)); + VBOXSTRICTRC rcStrict = PGMPhysRead((PVM)pvVM, GCPhys, &u16, sizeof(u16), PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); + return u16; +} + +static uint32_t remR3HandlerReadU32(void *pvVM, target_phys_addr_t GCPhys) +{ + uint32_t u32; + Log2(("remR3HandlerReadU32: GCPhys=%RGp\n", (RTGCPHYS)GCPhys)); + VBOXSTRICTRC rcStrict = PGMPhysRead((PVM)pvVM, GCPhys, &u32, sizeof(u32), PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); + return u32; +} + +static void remR3HandlerWriteU8(void *pvVM, target_phys_addr_t GCPhys, uint32_t u32) +{ + Log2(("remR3HandlerWriteU8: GCPhys=%RGp u32=%#x\n", (RTGCPHYS)GCPhys, u32)); + VBOXSTRICTRC rcStrict = PGMPhysWrite((PVM)pvVM, GCPhys, &u32, sizeof(uint8_t), PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); +} + +static void remR3HandlerWriteU16(void *pvVM, target_phys_addr_t GCPhys, uint32_t u32) +{ + Log2(("remR3HandlerWriteU16: GCPhys=%RGp u32=%#x\n", (RTGCPHYS)GCPhys, u32)); + VBOXSTRICTRC rcStrict = PGMPhysWrite((PVM)pvVM, GCPhys, &u32, sizeof(uint16_t), PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); +} + +static void 
remR3HandlerWriteU32(void *pvVM, target_phys_addr_t GCPhys, uint32_t u32) +{ + Log2(("remR3HandlerWriteU32: GCPhys=%RGp u32=%#x\n", (RTGCPHYS)GCPhys, u32)); + VBOXSTRICTRC rcStrict = PGMPhysWrite((PVM)pvVM, GCPhys, &u32, sizeof(uint32_t), PGMACCESSORIGIN_REM); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); NOREF(rcStrict); +} + +/* -+- disassembly -+- */ + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM_DISAS + + +/** + * Enables or disables singled stepped disassembly. + * + * @returns VBox status code. + * @param pVM VM handle. + * @param fEnable To enable set this flag, to disable clear it. + */ +static DECLCALLBACK(int) remR3DisasEnableStepping(PVM pVM, bool fEnable) +{ + LogFlow(("remR3DisasEnableStepping: fEnable=%d\n", fEnable)); + VM_ASSERT_EMT(pVM); + + if (fEnable) + pVM->rem.s.Env.state |= CPU_EMULATE_SINGLE_STEP; + else + pVM->rem.s.Env.state &= ~CPU_EMULATE_SINGLE_STEP; +#ifdef REM_USE_QEMU_SINGLE_STEP_FOR_LOGGING + cpu_single_step(&pVM->rem.s.Env, fEnable); +#endif + return VINF_SUCCESS; +} + + +/** + * Enables or disables singled stepped disassembly. + * + * @returns VBox status code. + * @param pVM VM handle. + * @param fEnable To enable set this flag, to disable clear it. + */ +REMR3DECL(int) REMR3DisasEnableStepping(PVM pVM, bool fEnable) +{ + int rc; + + LogFlow(("REMR3DisasEnableStepping: fEnable=%d\n", fEnable)); + if (VM_IS_EMT(pVM)) + return remR3DisasEnableStepping(pVM, fEnable); + + rc = VMR3ReqPriorityCallWait(pVM, VMCPUID_ANY, (PFNRT)remR3DisasEnableStepping, 2, pVM, fEnable); + AssertRC(rc); + return rc; +} + + +#ifdef VBOX_WITH_DEBUGGER +/** + * External Debugger Command: .remstep [on|off|1|0] + */ +static DECLCALLBACK(int) remR3CmdDisasEnableStepping(PCDBGCCMD pCmd, PDBGCCMDHLP pCmdHlp, PUVM pUVM, + PCDBGCVAR paArgs, unsigned cArgs) +{ + int rc; + PVM pVM = pUVM->pVM; + + if (cArgs == 0) + /* + * Print the current status. + */ + rc = DBGCCmdHlpPrintf(pCmdHlp, "DisasStepping is %s\n", + pVM->rem.s.Env.state & CPU_EMULATE_SINGLE_STEP ? "enabled" : "disabled"); + else + { + /* + * Convert the argument and change the mode. + */ + bool fEnable; + rc = DBGCCmdHlpVarToBool(pCmdHlp, &paArgs[0], &fEnable); + if (RT_SUCCESS(rc)) + { + rc = REMR3DisasEnableStepping(pVM, fEnable); + if (RT_SUCCESS(rc)) + rc = DBGCCmdHlpPrintf(pCmdHlp, "DisasStepping was %s\n", fEnable ? "enabled" : "disabled"); + else + rc = DBGCCmdHlpFailRc(pCmdHlp, pCmd, rc, "REMR3DisasEnableStepping"); + } + else + rc = DBGCCmdHlpFailRc(pCmdHlp, pCmd, rc, "DBGCCmdHlpVarToBool"); + } + return rc; +} +#endif /* VBOX_WITH_DEBUGGER */ + + +/** + * Disassembles one instruction and prints it to the log. + * + * @returns Success indicator. + * @param env Pointer to the recompiler CPU structure. + * @param f32BitCode Indicates that whether or not the code should + * be disassembled as 16 or 32 bit. If -1 the CS + * selector will be inspected. + * @param pszPrefix + */ +bool remR3DisasInstr(CPUX86State *env, int f32BitCode, char *pszPrefix) +{ + PVM pVM = env->pVM; + const bool fLog = LogIsEnabled(); + const bool fLog2 = LogIs2Enabled(); + int rc = VINF_SUCCESS; + + /* + * Don't bother if there ain't any log output to do. + */ + if (!fLog && !fLog2) + return true; + + /* + * Update the state so DBGF reads the correct register values. + */ + remR3StateUpdate(pVM, env->pVCpu); + + /* + * Log registers if requested. + */ + if (fLog2) + DBGFR3_INFO_LOG(pVM, env->pVCpu, "cpumguest", pszPrefix); + + /* + * Disassemble to log. 
+ */ + if (fLog) + { + PVMCPU pVCpu = VMMGetCpu(pVM); + char szBuf[256]; + szBuf[0] = '\0'; + int rc = DBGFR3DisasInstrEx(pVCpu->pVMR3->pUVM, + pVCpu->idCpu, + 0, /* Sel */ 0, /* GCPtr */ + DBGF_DISAS_FLAGS_CURRENT_GUEST | DBGF_DISAS_FLAGS_DEFAULT_MODE, + szBuf, + sizeof(szBuf), + NULL); + if (RT_FAILURE(rc)) + RTStrPrintf(szBuf, sizeof(szBuf), "DBGFR3DisasInstrEx failed with rc=%Rrc\n", rc); + if (pszPrefix && *pszPrefix) + RTLogPrintf("%s-CPU%d: %s\n", pszPrefix, pVCpu->idCpu, szBuf); + else + RTLogPrintf("CPU%d: %s\n", pVCpu->idCpu, szBuf); + } + + return RT_SUCCESS(rc); +} + + +/** + * Disassemble recompiled code. + * + * @param phFileIgnored Ignored, logfile usually. + * @param pvCode Pointer to the code block. + * @param cb Size of the code block. + */ +void disas(FILE *phFileIgnored, void *pvCode, unsigned long cb) +{ + if (LogIs2Enabled()) + { + unsigned off = 0; + char szOutput[256]; + DISCPUSTATE Cpu; +#ifdef RT_ARCH_X86 + DISCPUMODE enmCpuMode = DISCPUMODE_32BIT; +#else + DISCPUMODE enmCpuMode = DISCPUMODE_64BIT; +#endif + + RTLogPrintf("Recompiled Code: %p %#lx (%ld) bytes\n", pvCode, cb, cb); + while (off < cb) + { + uint32_t cbInstr; + int rc = DISInstrToStr((uint8_t const *)pvCode + off, enmCpuMode, + &Cpu, &cbInstr, szOutput, sizeof(szOutput)); + if (RT_SUCCESS(rc)) + RTLogPrintf("%s", szOutput); + else + { + RTLogPrintf("disas error %Rrc\n", rc); + cbInstr = 1; + } + off += cbInstr; + } + } +} + + +/** + * Disassemble guest code. + * + * @param phFileIgnored Ignored, logfile usually. + * @param uCode The guest address of the code to disassemble. (flat?) + * @param cb Number of bytes to disassemble. + * @param fFlags Flags, probably something which tells if this is 16, 32 or 64 bit code. + */ +void target_disas(FILE *phFileIgnored, target_ulong uCode, target_ulong cb, int fFlags) +{ + if (LogIs2Enabled()) + { + PVM pVM = cpu_single_env->pVM; + PVMCPU pVCpu = cpu_single_env->pVCpu; + RTSEL cs; + RTGCUINTPTR eip; + + Assert(pVCpu); + + /* + * Update the state so DBGF reads the correct register values (flags). + */ + remR3StateUpdate(pVM, pVCpu); + + /* + * Do the disassembling. + */ + RTLogPrintf("Guest Code: PC=%llx %llx bytes fFlags=%d\n", (uint64_t)uCode, (uint64_t)cb, fFlags); + cs = cpu_single_env->segs[R_CS].selector; + eip = uCode - cpu_single_env->segs[R_CS].base; + for (;;) + { + char szBuf[256]; + uint32_t cbInstr; + int rc = DBGFR3DisasInstrEx(pVM->pUVM, + pVCpu->idCpu, + cs, + eip, + DBGF_DISAS_FLAGS_DEFAULT_MODE, + szBuf, sizeof(szBuf), + &cbInstr); + if (RT_SUCCESS(rc)) + RTLogPrintf("%llx %s\n", (uint64_t)uCode, szBuf); + else + { + RTLogPrintf("%llx %04x:%llx: %s\n", (uint64_t)uCode, cs, (uint64_t)eip, szBuf); + cbInstr = 1; + } + + /* next */ + if (cb <= cbInstr) + break; + cb -= cbInstr; + uCode += cbInstr; + eip += cbInstr; + } + } +} + + +/** + * Looks up a guest symbol. + * + * @returns Pointer to symbol name. This is a static buffer. + * @param orig_addr The address in question. 
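+ *
+ * @remark  Output sketch with a made up symbol: an address resolving 0x12 bytes
+ *          past a symbol named "SomeGuestProc" comes back as "SomeGuestProc+12",
+ *          an exact hit as just "SomeGuestProc", and an unresolved address as
+ *          "<N/A>".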
+ */ +const char *lookup_symbol(target_ulong orig_addr) +{ + PVM pVM = cpu_single_env->pVM; + RTGCINTPTR off = 0; + RTDBGSYMBOL Sym; + DBGFADDRESS Addr; + + int rc = DBGFR3AsSymbolByAddr(pVM->pUVM, DBGF_AS_GLOBAL, DBGFR3AddrFromFlat(pVM->pUVM, &Addr, orig_addr), + RTDBGSYMADDR_FLAGS_LESS_OR_EQUAL | RTDBGSYMADDR_FLAGS_SKIP_ABS_IN_DEFERRED, + &off, &Sym, NULL /*phMod*/); + if (RT_SUCCESS(rc)) + { + static char szSym[sizeof(Sym.szName) + 48]; + if (!off) + RTStrPrintf(szSym, sizeof(szSym), "%s\n", Sym.szName); + else if (off > 0) + RTStrPrintf(szSym, sizeof(szSym), "%s+%x\n", Sym.szName, off); + else + RTStrPrintf(szSym, sizeof(szSym), "%s-%x\n", Sym.szName, -off); + return szSym; + } + return "<N/A>"; +} + + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM + + +/* -+- FF notifications -+- */ + +/** + * Notification about the interrupt FF being set. + * + * @param pVM VM Handle. + * @param pVCpu VMCPU Handle. + * @thread The emulation thread. + */ +REMR3DECL(void) REMR3NotifyInterruptSet(PVM pVM, PVMCPU pVCpu) +{ + LogFlow(("REMR3NotifyInterruptSet: fInRem=%d interrupts %s\n", pVM->rem.s.fInREM, + (pVM->rem.s.Env.eflags & IF_MASK) && !(pVM->rem.s.Env.hflags & HF_INHIBIT_IRQ_MASK) ? "enabled" : "disabled")); + if (pVM->rem.s.fInREM) + ASMAtomicOrS32((int32_t volatile *)&cpu_single_env->interrupt_request, CPU_INTERRUPT_EXTERNAL_HARD); +} + + +/** + * Notification about the interrupt FF being set. + * + * @param pVM VM Handle. + * @param pVCpu VMCPU Handle. + * @thread Any. + */ +REMR3DECL(void) REMR3NotifyInterruptClear(PVM pVM, PVMCPU pVCpu) +{ + LogFlow(("REMR3NotifyInterruptClear:\n")); + if (pVM->rem.s.fInREM) + cpu_reset_interrupt(cpu_single_env, CPU_INTERRUPT_HARD); +} + + +/** + * Notification about pending timer(s). + * + * @param pVM VM Handle. + * @param pVCpuDst The target cpu for this notification. + * TM will not broadcast pending timer events, but use + * a dedicated EMT for them. So, only interrupt REM + * execution if the given CPU is executing in REM. + * @thread Any. + */ +REMR3DECL(void) REMR3NotifyTimerPending(PVM pVM, PVMCPU pVCpuDst) +{ +#ifndef DEBUG_bird + LogFlow(("REMR3NotifyTimerPending: fInRem=%d\n", pVM->rem.s.fInREM)); +#endif + if (pVM->rem.s.fInREM) + { + if (pVM->rem.s.Env.pVCpu == pVCpuDst) + { + LogIt(RTLOGGRPFLAGS_LEVEL_5, LOG_GROUP_TM, ("REMR3NotifyTimerPending: setting\n")); + ASMAtomicOrS32((int32_t volatile *)&pVM->rem.s.Env.interrupt_request, + CPU_INTERRUPT_EXTERNAL_TIMER); + } + else + LogIt(RTLOGGRPFLAGS_LEVEL_5, LOG_GROUP_TM, ("REMR3NotifyTimerPending: pVCpu:%p != pVCpuDst:%p\n", pVM->rem.s.Env.pVCpu, pVCpuDst)); + } + else + LogIt(RTLOGGRPFLAGS_LEVEL_5, LOG_GROUP_TM, ("REMR3NotifyTimerPending: !fInREM; cpu state=%d\n", VMCPU_GET_STATE(pVCpuDst))); +} + + +/** + * Notification about pending DMA transfers. + * + * @param pVM VM Handle. + * @thread Any. + */ +REMR3DECL(void) REMR3NotifyDmaPending(PVM pVM) +{ + LogFlow(("REMR3NotifyDmaPending: fInRem=%d\n", pVM->rem.s.fInREM)); + if (pVM->rem.s.fInREM) + ASMAtomicOrS32((int32_t volatile *)&cpu_single_env->interrupt_request, CPU_INTERRUPT_EXTERNAL_DMA); +} + + +/** + * Notification about pending timer(s). + * + * @param pVM VM Handle. + * @thread Any. + */ +REMR3DECL(void) REMR3NotifyQueuePending(PVM pVM) +{ + LogFlow(("REMR3NotifyQueuePending: fInRem=%d\n", pVM->rem.s.fInREM)); + if (pVM->rem.s.fInREM) + ASMAtomicOrS32((int32_t volatile *)&cpu_single_env->interrupt_request, CPU_INTERRUPT_EXTERNAL_EXIT); +} + + +/** + * Notification about pending FF set by an external thread. 
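+ * If the recompiler is currently executing (fInREM is set), this ORs
+ * CPU_INTERRUPT_EXTERNAL_EXIT into the interrupt request word so the inner
+ * execution loop bails out and the forced action can be serviced; otherwise it
+ * does nothing.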
+ * + * @param pVM VM handle. + * @thread Any. + */ +REMR3DECL(void) REMR3NotifyFF(PVM pVM) +{ + LogFlow(("REMR3NotifyFF: fInRem=%d\n", pVM->rem.s.fInREM)); + if (pVM->rem.s.fInREM) + ASMAtomicOrS32((int32_t volatile *)&cpu_single_env->interrupt_request, CPU_INTERRUPT_EXTERNAL_EXIT); +} + + +#ifdef VBOX_WITH_STATISTICS +void remR3ProfileStart(int statcode) +{ + STAMPROFILEADV *pStat; + switch(statcode) + { + case STATS_EMULATE_SINGLE_INSTR: + pStat = &gStatExecuteSingleInstr; + break; + case STATS_QEMU_COMPILATION: + pStat = &gStatCompilationQEmu; + break; + case STATS_QEMU_RUN_EMULATED_CODE: + pStat = &gStatRunCodeQEmu; + break; + case STATS_QEMU_TOTAL: + pStat = &gStatTotalTimeQEmu; + break; + case STATS_QEMU_RUN_TIMERS: + pStat = &gStatTimers; + break; + case STATS_TLB_LOOKUP: + pStat= &gStatTBLookup; + break; + case STATS_IRQ_HANDLING: + pStat= &gStatIRQ; + break; + case STATS_RAW_CHECK: + pStat = &gStatRawCheck; + break; + + default: + AssertMsgFailed(("unknown stat %d\n", statcode)); + return; + } + STAM_PROFILE_ADV_START(pStat, a); +} + + +void remR3ProfileStop(int statcode) +{ + STAMPROFILEADV *pStat; + switch(statcode) + { + case STATS_EMULATE_SINGLE_INSTR: + pStat = &gStatExecuteSingleInstr; + break; + case STATS_QEMU_COMPILATION: + pStat = &gStatCompilationQEmu; + break; + case STATS_QEMU_RUN_EMULATED_CODE: + pStat = &gStatRunCodeQEmu; + break; + case STATS_QEMU_TOTAL: + pStat = &gStatTotalTimeQEmu; + break; + case STATS_QEMU_RUN_TIMERS: + pStat = &gStatTimers; + break; + case STATS_TLB_LOOKUP: + pStat= &gStatTBLookup; + break; + case STATS_IRQ_HANDLING: + pStat= &gStatIRQ; + break; + case STATS_RAW_CHECK: + pStat = &gStatRawCheck; + break; + default: + AssertMsgFailed(("unknown stat %d\n", statcode)); + return; + } + STAM_PROFILE_ADV_STOP(pStat, a); +} +#endif + +/** + * Raise an RC, force rem exit. + * + * @param pVM VM handle. + * @param rc The rc. + */ +void remR3RaiseRC(PVM pVM, int rc) +{ + Log(("remR3RaiseRC: rc=%Rrc\n", rc)); + Assert(pVM->rem.s.fInREM); + VM_ASSERT_EMT(pVM); + pVM->rem.s.rc = rc; + cpu_interrupt(&pVM->rem.s.Env, CPU_INTERRUPT_RC); +} + + +/* -+- timers -+- */ + +uint64_t cpu_get_tsc(CPUX86State *env) +{ + STAM_COUNTER_INC(&gStatCpuGetTSC); + return TMCpuTickGet(env->pVCpu); +} + + +/* -+- interrupts -+- */ + +void cpu_set_ferr(CPUX86State *env) +{ + int rc = PDMIsaSetIrq(env->pVM, 13, 1, 0 /*uTagSrc*/); + LogFlow(("cpu_set_ferr: rc=%d\n", rc)); NOREF(rc); +} + +int cpu_get_pic_interrupt(CPUX86State *env) +{ + uint8_t u8Interrupt; + int rc; + + if (VMCPU_FF_TEST_AND_CLEAR(env->pVCpu, VMCPU_FF_UPDATE_APIC)) + APICUpdatePendingInterrupts(env->pVCpu); + + /* When we fail to forward interrupts directly in raw mode, we fall back to the recompiler. + * In that case we can't call PDMGetInterrupt anymore, because it has already cleared the interrupt + * with the (a)pic. + */ + /* Note! We assume we will go directly to the recompiler to handle the pending interrupt! */ + rc = PDMGetInterrupt(env->pVCpu, &u8Interrupt); + LogFlow(("cpu_get_pic_interrupt: u8Interrupt=%d rc=%Rrc pc=%04x:%08llx ~flags=%08llx\n", + u8Interrupt, rc, env->segs[R_CS].selector, (uint64_t)env->eip, (uint64_t)env->eflags)); + if (RT_SUCCESS(rc)) + { + if (VMCPU_FF_IS_ANY_SET(env->pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC)) + env->interrupt_request |= CPU_INTERRUPT_HARD; + return u8Interrupt; + } + return -1; +} + + +/* -+- local apic -+- */ + +#if 0 /* CPUMSetGuestMsr does this now. 
*/ +void cpu_set_apic_base(CPUX86State *env, uint64_t val) +{ + int rc = PDMApicSetBase(env->pVM, val); + LogFlow(("cpu_set_apic_base: val=%#llx rc=%Rrc\n", val, rc)); NOREF(rc); +} +#endif + +uint64_t cpu_get_apic_base(CPUX86State *env) +{ + uint64_t u64; + VBOXSTRICTRC rcStrict = CPUMQueryGuestMsr(env->pVCpu, MSR_IA32_APICBASE, &u64); + if (RT_SUCCESS(rcStrict)) + { + LogFlow(("cpu_get_apic_base: returns %#llx \n", u64)); + return u64; + } + LogFlow(("cpu_get_apic_base: returns 0 (rc=%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict))); + return 0; +} + +void cpu_set_apic_tpr(CPUX86State *env, uint8_t val) +{ + int rc = APICSetTpr(env->pVCpu, val << 4); /* cr8 bits 3-0 correspond to bits 7-4 of the task priority mmio register. */ + LogFlow(("cpu_set_apic_tpr: val=%#x rc=%Rrc\n", val, rc)); NOREF(rc); +} + +uint8_t cpu_get_apic_tpr(CPUX86State *env) +{ + uint8_t u8; + int rc = APICGetTpr(env->pVCpu, &u8, NULL, NULL); + if (RT_SUCCESS(rc)) + { + LogFlow(("cpu_get_apic_tpr: returns %#x\n", u8)); + return u8 >> 4; /* cr8 bits 3-0 correspond to bits 7-4 of the task priority mmio register. */ + } + LogFlow(("cpu_get_apic_tpr: returns 0 (rc=%Rrc)\n", rc)); + return 0; +} + +/** + * Read an MSR. + * + * @retval 0 success. + * @retval -1 failure, raise \#GP(0). + * @param env The cpu state. + * @param idMsr The MSR to read. + * @param puValue Where to return the value. + */ +int cpu_rdmsr(CPUX86State *env, uint32_t idMsr, uint64_t *puValue) +{ + Assert(env->pVCpu); + return CPUMQueryGuestMsr(env->pVCpu, idMsr, puValue) == VINF_SUCCESS ? 0 : -1; +} + +/** + * Write to an MSR. + * + * @retval 0 success. + * @retval -1 failure, raise \#GP(0). + * @param env The cpu state. + * @param idMsr The MSR to write to. + * @param uValue The value to write. + */ +int cpu_wrmsr(CPUX86State *env, uint32_t idMsr, uint64_t uValue) +{ + Assert(env->pVCpu); + return CPUMSetGuestMsr(env->pVCpu, idMsr, uValue) == VINF_SUCCESS ? 
0 : -1; +} + +/* -+- I/O Ports -+- */ + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM_IOPORT + +void cpu_outb(CPUX86State *env, pio_addr_t addr, uint8_t val) +{ + int rc; + + if (addr != 0x80 && addr != 0x70 && addr != 0x61) + Log2(("cpu_outb: addr=%#06x val=%#x\n", addr, val)); + + rc = IOMIOPortWrite(env->pVM, env->pVCpu, (RTIOPORT)addr, val, 1); + if (RT_LIKELY(rc == VINF_SUCCESS)) + return; + if (rc >= VINF_EM_FIRST && rc <= VINF_EM_LAST) + { + Log(("cpu_outb: addr=%#06x val=%#x -> %Rrc\n", addr, val, rc)); + remR3RaiseRC(env->pVM, rc); + return; + } + remAbort(rc, __FUNCTION__); +} + +void cpu_outw(CPUX86State *env, pio_addr_t addr, uint16_t val) +{ + //Log2(("cpu_outw: addr=%#06x val=%#x\n", addr, val)); + int rc = IOMIOPortWrite(env->pVM, env->pVCpu, (RTIOPORT)addr, val, 2); + if (RT_LIKELY(rc == VINF_SUCCESS)) + return; + if (rc >= VINF_EM_FIRST && rc <= VINF_EM_LAST) + { + Log(("cpu_outw: addr=%#06x val=%#x -> %Rrc\n", addr, val, rc)); + remR3RaiseRC(env->pVM, rc); + return; + } + remAbort(rc, __FUNCTION__); +} + +void cpu_outl(CPUX86State *env, pio_addr_t addr, uint32_t val) +{ + int rc; + Log2(("cpu_outl: addr=%#06x val=%#x\n", addr, val)); + rc = IOMIOPortWrite(env->pVM, env->pVCpu, (RTIOPORT)addr, val, 4); + if (RT_LIKELY(rc == VINF_SUCCESS)) + return; + if (rc >= VINF_EM_FIRST && rc <= VINF_EM_LAST) + { + Log(("cpu_outl: addr=%#06x val=%#x -> %Rrc\n", addr, val, rc)); + remR3RaiseRC(env->pVM, rc); + return; + } + remAbort(rc, __FUNCTION__); +} + +uint8_t cpu_inb(CPUX86State *env, pio_addr_t addr) +{ + uint32_t u32 = 0; + int rc = IOMIOPortRead(env->pVM, env->pVCpu, (RTIOPORT)addr, &u32, 1); + if (RT_LIKELY(rc == VINF_SUCCESS)) + { + if (/*addr != 0x61 && */addr != 0x71) + Log2(("cpu_inb: addr=%#06x -> %#x\n", addr, u32)); + return (uint8_t)u32; + } + if (rc >= VINF_EM_FIRST && rc <= VINF_EM_LAST) + { + Log(("cpu_inb: addr=%#06x -> %#x rc=%Rrc\n", addr, u32, rc)); + remR3RaiseRC(env->pVM, rc); + return (uint8_t)u32; + } + remAbort(rc, __FUNCTION__); + return UINT8_C(0xff); +} + +uint16_t cpu_inw(CPUX86State *env, pio_addr_t addr) +{ + uint32_t u32 = 0; + int rc = IOMIOPortRead(env->pVM, env->pVCpu, (RTIOPORT)addr, &u32, 2); + if (RT_LIKELY(rc == VINF_SUCCESS)) + { + Log2(("cpu_inw: addr=%#06x -> %#x\n", addr, u32)); + return (uint16_t)u32; + } + if (rc >= VINF_EM_FIRST && rc <= VINF_EM_LAST) + { + Log(("cpu_inw: addr=%#06x -> %#x rc=%Rrc\n", addr, u32, rc)); + remR3RaiseRC(env->pVM, rc); + return (uint16_t)u32; + } + remAbort(rc, __FUNCTION__); + return UINT16_C(0xffff); +} + +uint32_t cpu_inl(CPUX86State *env, pio_addr_t addr) +{ + uint32_t u32 = 0; + int rc = IOMIOPortRead(env->pVM, env->pVCpu, (RTIOPORT)addr, &u32, 4); + if (RT_LIKELY(rc == VINF_SUCCESS)) + { + Log2(("cpu_inl: addr=%#06x -> %#x\n", addr, u32)); + return u32; + } + if (rc >= VINF_EM_FIRST && rc <= VINF_EM_LAST) + { + Log(("cpu_inl: addr=%#06x -> %#x rc=%Rrc\n", addr, u32, rc)); + remR3RaiseRC(env->pVM, rc); + return u32; + } + remAbort(rc, __FUNCTION__); + return UINT32_C(0xffffffff); +} + +#undef LOG_GROUP +#define LOG_GROUP LOG_GROUP_REM + + +/* -+- helpers and misc other interfaces -+- */ + +/** + * Perform the CPUID instruction. + * + * @param env Pointer to the recompiler CPU structure. + * @param idx The CPUID leaf (eax). + * @param idxSub The CPUID sub-leaf (ecx) where applicable. + * @param pEAX Where to store eax. + * @param pEBX Where to store ebx. + * @param pECX Where to store ecx. + * @param pEDX Where to store edx. 
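+ *
+ * A minimal usage sketch (illustrative only; leaf 1 is the standard feature
+ * information leaf):
+ * @code
+ *      uint32_t uEAX, uEBX, uECX, uEDX;
+ *      cpu_x86_cpuid(env, 1, 0, &uEAX, &uEBX, &uECX, &uEDX);
+ *      Log(("cpuid(1): eax=%#x features: edx=%#x ecx=%#x\n", uEAX, uEDX, uECX));
+ * @endcode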
+ */ +void cpu_x86_cpuid(CPUX86State *env, uint32_t idx, uint32_t idxSub, + uint32_t *pEAX, uint32_t *pEBX, uint32_t *pECX, uint32_t *pEDX) +{ + NOREF(idxSub); + CPUMGetGuestCpuId(env->pVCpu, idx, idxSub, pEAX, pEBX, pECX, pEDX); +} + + +#if 0 /* not used */ +/** + * Interface for qemu hardware to report back fatal errors. + */ +void hw_error(const char *pszFormat, ...) +{ + /* + * Bitch about it. + */ + /** @todo Add support for nested arg lists in the LogPrintfV routine! I've code for + * this in my Odin32 tree at home! */ + va_list args; + va_start(args, pszFormat); + RTLogPrintf("fatal error in virtual hardware:"); + RTLogPrintfV(pszFormat, args); + va_end(args); + AssertReleaseMsgFailed(("fatal error in virtual hardware: %s\n", pszFormat)); + + /* + * If we're in REM context we'll sync back the state before 'jumping' to + * the EMs failure handling. + */ + PVM pVM = cpu_single_env->pVM; + if (pVM->rem.s.fInREM) + REMR3StateBack(pVM); + EMR3FatalError(pVM, VERR_REM_VIRTUAL_HARDWARE_ERROR); + AssertMsgFailed(("EMR3FatalError returned!\n")); +} +#endif + +/** + * Interface for the qemu cpu to report unhandled situation + * raising a fatal VM error. + */ +void cpu_abort(CPUX86State *env, const char *pszFormat, ...) +{ + va_list va; + PVM pVM; + PVMCPU pVCpu; + char szMsg[256]; + + /* + * Bitch about it. + */ + RTLogFlags(NULL, "nodisabled nobuffered"); + RTLogFlush(NULL); + + va_start(va, pszFormat); +#if defined(RT_OS_WINDOWS) && ARCH_BITS == 64 + /* It's a bit complicated when mixing MSC and GCC on AMD64. This is a bit ugly, but it works. */ + unsigned cArgs = 0; + uintptr_t auArgs[6] = {0,0,0,0,0,0}; + const char *psz = strchr(pszFormat, '%'); + while (psz && cArgs < 6) + { + auArgs[cArgs++] = va_arg(va, uintptr_t); + psz = strchr(psz + 1, '%'); + } + switch (cArgs) + { + case 1: RTStrPrintf(szMsg, sizeof(szMsg), pszFormat, auArgs[0]); break; + case 2: RTStrPrintf(szMsg, sizeof(szMsg), pszFormat, auArgs[0], auArgs[1]); break; + case 3: RTStrPrintf(szMsg, sizeof(szMsg), pszFormat, auArgs[0], auArgs[1], auArgs[2]); break; + case 4: RTStrPrintf(szMsg, sizeof(szMsg), pszFormat, auArgs[0], auArgs[1], auArgs[2], auArgs[3]); break; + case 5: RTStrPrintf(szMsg, sizeof(szMsg), pszFormat, auArgs[0], auArgs[1], auArgs[2], auArgs[3], auArgs[4]); break; + case 6: RTStrPrintf(szMsg, sizeof(szMsg), pszFormat, auArgs[0], auArgs[1], auArgs[2], auArgs[3], auArgs[4], auArgs[5]); break; + default: + case 0: RTStrPrintf(szMsg, sizeof(szMsg), "%s", pszFormat); break; + } +#else + RTStrPrintfV(szMsg, sizeof(szMsg), pszFormat, va); +#endif + va_end(va); + + RTLogPrintf("fatal error in recompiler cpu: %s\n", szMsg); + RTLogRelPrintf("fatal error in recompiler cpu: %s\n", szMsg); + + /* + * If we're in REM context we'll sync back the state before 'jumping' to + * the EMs failure handling. + */ + pVM = cpu_single_env->pVM; + pVCpu = cpu_single_env->pVCpu; + Assert(pVCpu); + + if (pVM->rem.s.fInREM) + REMR3StateBack(pVM, pVCpu); + EMR3FatalError(pVCpu, VERR_REM_VIRTUAL_CPU_ERROR); + AssertMsgFailed(("EMR3FatalError returned!\n")); +} + + +/** + * Aborts the VM. + * + * @param rc VBox error code. + * @param pszTip Hint about why/when this happened. + */ +void remAbort(int rc, const char *pszTip) +{ + PVM pVM; + PVMCPU pVCpu; + + /* + * Bitch about it. + */ + RTLogPrintf("internal REM fatal error: rc=%Rrc %s\n", rc, pszTip); + AssertReleaseMsgFailed(("internal REM fatal error: rc=%Rrc %s\n", rc, pszTip)); + + /* + * Jump back to where we entered the recompiler. 
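+ * (REMR3StateBack first flushes the recompiler state back to CPUM so EM sees
+ * consistent registers; EMR3FatalError is not expected to return, which the
+ * assertion below documents.)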
+ */ + pVM = cpu_single_env->pVM; + pVCpu = cpu_single_env->pVCpu; + Assert(pVCpu); + + if (pVM->rem.s.fInREM) + REMR3StateBack(pVM, pVCpu); + + EMR3FatalError(pVCpu, rc); + AssertMsgFailed(("EMR3FatalError returned!\n")); +} + + +/** + * Dumps a linux system call. + * @param pVCpu VMCPU handle. + */ +void remR3DumpLnxSyscall(PVMCPU pVCpu) +{ + static const char *apsz[] = + { + "sys_restart_syscall", /* 0 - old "setup()" system call, used for restarting */ + "sys_exit", + "sys_fork", + "sys_read", + "sys_write", + "sys_open", /* 5 */ + "sys_close", + "sys_waitpid", + "sys_creat", + "sys_link", + "sys_unlink", /* 10 */ + "sys_execve", + "sys_chdir", + "sys_time", + "sys_mknod", + "sys_chmod", /* 15 */ + "sys_lchown16", + "sys_ni_syscall", /* old break syscall holder */ + "sys_stat", + "sys_lseek", + "sys_getpid", /* 20 */ + "sys_mount", + "sys_oldumount", + "sys_setuid16", + "sys_getuid16", + "sys_stime", /* 25 */ + "sys_ptrace", + "sys_alarm", + "sys_fstat", + "sys_pause", + "sys_utime", /* 30 */ + "sys_ni_syscall", /* old stty syscall holder */ + "sys_ni_syscall", /* old gtty syscall holder */ + "sys_access", + "sys_nice", + "sys_ni_syscall", /* 35 - old ftime syscall holder */ + "sys_sync", + "sys_kill", + "sys_rename", + "sys_mkdir", + "sys_rmdir", /* 40 */ + "sys_dup", + "sys_pipe", + "sys_times", + "sys_ni_syscall", /* old prof syscall holder */ + "sys_brk", /* 45 */ + "sys_setgid16", + "sys_getgid16", + "sys_signal", + "sys_geteuid16", + "sys_getegid16", /* 50 */ + "sys_acct", + "sys_umount", /* recycled never used phys() */ + "sys_ni_syscall", /* old lock syscall holder */ + "sys_ioctl", + "sys_fcntl", /* 55 */ + "sys_ni_syscall", /* old mpx syscall holder */ + "sys_setpgid", + "sys_ni_syscall", /* old ulimit syscall holder */ + "sys_olduname", + "sys_umask", /* 60 */ + "sys_chroot", + "sys_ustat", + "sys_dup2", + "sys_getppid", + "sys_getpgrp", /* 65 */ + "sys_setsid", + "sys_sigaction", + "sys_sgetmask", + "sys_ssetmask", + "sys_setreuid16", /* 70 */ + "sys_setregid16", + "sys_sigsuspend", + "sys_sigpending", + "sys_sethostname", + "sys_setrlimit", /* 75 */ + "sys_old_getrlimit", + "sys_getrusage", + "sys_gettimeofday", + "sys_settimeofday", + "sys_getgroups16", /* 80 */ + "sys_setgroups16", + "old_select", + "sys_symlink", + "sys_lstat", + "sys_readlink", /* 85 */ + "sys_uselib", + "sys_swapon", + "sys_reboot", + "old_readdir", + "old_mmap", /* 90 */ + "sys_munmap", + "sys_truncate", + "sys_ftruncate", + "sys_fchmod", + "sys_fchown16", /* 95 */ + "sys_getpriority", + "sys_setpriority", + "sys_ni_syscall", /* old profil syscall holder */ + "sys_statfs", + "sys_fstatfs", /* 100 */ + "sys_ioperm", + "sys_socketcall", + "sys_syslog", + "sys_setitimer", + "sys_getitimer", /* 105 */ + "sys_newstat", + "sys_newlstat", + "sys_newfstat", + "sys_uname", + "sys_iopl", /* 110 */ + "sys_vhangup", + "sys_ni_syscall", /* old "idle" system call */ + "sys_vm86old", + "sys_wait4", + "sys_swapoff", /* 115 */ + "sys_sysinfo", + "sys_ipc", + "sys_fsync", + "sys_sigreturn", + "sys_clone", /* 120 */ + "sys_setdomainname", + "sys_newuname", + "sys_modify_ldt", + "sys_adjtimex", + "sys_mprotect", /* 125 */ + "sys_sigprocmask", + "sys_ni_syscall", /* old "create_module" */ + "sys_init_module", + "sys_delete_module", + "sys_ni_syscall", /* 130: old "get_kernel_syms" */ + "sys_quotactl", + "sys_getpgid", + "sys_fchdir", + "sys_bdflush", + "sys_sysfs", /* 135 */ + "sys_personality", + "sys_ni_syscall", /* reserved for afs_syscall */ + "sys_setfsuid16", + "sys_setfsgid16", + "sys_llseek", /* 140 */ + 
"sys_getdents", + "sys_select", + "sys_flock", + "sys_msync", + "sys_readv", /* 145 */ + "sys_writev", + "sys_getsid", + "sys_fdatasync", + "sys_sysctl", + "sys_mlock", /* 150 */ + "sys_munlock", + "sys_mlockall", + "sys_munlockall", + "sys_sched_setparam", + "sys_sched_getparam", /* 155 */ + "sys_sched_setscheduler", + "sys_sched_getscheduler", + "sys_sched_yield", + "sys_sched_get_priority_max", + "sys_sched_get_priority_min", /* 160 */ + "sys_sched_rr_get_interval", + "sys_nanosleep", + "sys_mremap", + "sys_setresuid16", + "sys_getresuid16", /* 165 */ + "sys_vm86", + "sys_ni_syscall", /* Old sys_query_module */ + "sys_poll", + "sys_nfsservctl", + "sys_setresgid16", /* 170 */ + "sys_getresgid16", + "sys_prctl", + "sys_rt_sigreturn", + "sys_rt_sigaction", + "sys_rt_sigprocmask", /* 175 */ + "sys_rt_sigpending", + "sys_rt_sigtimedwait", + "sys_rt_sigqueueinfo", + "sys_rt_sigsuspend", + "sys_pread64", /* 180 */ + "sys_pwrite64", + "sys_chown16", + "sys_getcwd", + "sys_capget", + "sys_capset", /* 185 */ + "sys_sigaltstack", + "sys_sendfile", + "sys_ni_syscall", /* reserved for streams1 */ + "sys_ni_syscall", /* reserved for streams2 */ + "sys_vfork", /* 190 */ + "sys_getrlimit", + "sys_mmap2", + "sys_truncate64", + "sys_ftruncate64", + "sys_stat64", /* 195 */ + "sys_lstat64", + "sys_fstat64", + "sys_lchown", + "sys_getuid", + "sys_getgid", /* 200 */ + "sys_geteuid", + "sys_getegid", + "sys_setreuid", + "sys_setregid", + "sys_getgroups", /* 205 */ + "sys_setgroups", + "sys_fchown", + "sys_setresuid", + "sys_getresuid", + "sys_setresgid", /* 210 */ + "sys_getresgid", + "sys_chown", + "sys_setuid", + "sys_setgid", + "sys_setfsuid", /* 215 */ + "sys_setfsgid", + "sys_pivot_root", + "sys_mincore", + "sys_madvise", + "sys_getdents64", /* 220 */ + "sys_fcntl64", + "sys_ni_syscall", /* reserved for TUX */ + "sys_ni_syscall", + "sys_gettid", + "sys_readahead", /* 225 */ + "sys_setxattr", + "sys_lsetxattr", + "sys_fsetxattr", + "sys_getxattr", + "sys_lgetxattr", /* 230 */ + "sys_fgetxattr", + "sys_listxattr", + "sys_llistxattr", + "sys_flistxattr", + "sys_removexattr", /* 235 */ + "sys_lremovexattr", + "sys_fremovexattr", + "sys_tkill", + "sys_sendfile64", + "sys_futex", /* 240 */ + "sys_sched_setaffinity", + "sys_sched_getaffinity", + "sys_set_thread_area", + "sys_get_thread_area", + "sys_io_setup", /* 245 */ + "sys_io_destroy", + "sys_io_getevents", + "sys_io_submit", + "sys_io_cancel", + "sys_fadvise64", /* 250 */ + "sys_ni_syscall", + "sys_exit_group", + "sys_lookup_dcookie", + "sys_epoll_create", + "sys_epoll_ctl", /* 255 */ + "sys_epoll_wait", + "sys_remap_file_pages", + "sys_set_tid_address", + "sys_timer_create", + "sys_timer_settime", /* 260 */ + "sys_timer_gettime", + "sys_timer_getoverrun", + "sys_timer_delete", + "sys_clock_settime", + "sys_clock_gettime", /* 265 */ + "sys_clock_getres", + "sys_clock_nanosleep", + "sys_statfs64", + "sys_fstatfs64", + "sys_tgkill", /* 270 */ + "sys_utimes", + "sys_fadvise64_64", + "sys_ni_syscall" /* sys_vserver */ + }; + + uint32_t uEAX = CPUMGetGuestEAX(pVCpu); + switch (uEAX) + { + default: + if (uEAX < RT_ELEMENTS(apsz)) + Log(("REM: linux syscall %3d: %s (eip=%08x ebx=%08x ecx=%08x edx=%08x esi=%08x edi=%08x ebp=%08x)\n", + uEAX, apsz[uEAX], CPUMGetGuestEIP(pVCpu), CPUMGetGuestEBX(pVCpu), CPUMGetGuestECX(pVCpu), + CPUMGetGuestEDX(pVCpu), CPUMGetGuestESI(pVCpu), CPUMGetGuestEDI(pVCpu), CPUMGetGuestEBP(pVCpu))); + else + Log(("eip=%08x: linux syscall %d (#%x) unknown\n", CPUMGetGuestEIP(pVCpu), uEAX, uEAX)); + break; + + } +} + + +/** + * Dumps an 
OpenBSD system call. + * @param pVCpu VMCPU handle. + */ +void remR3DumpOBsdSyscall(PVMCPU pVCpu) +{ + static const char *apsz[] = + { + "SYS_syscall", //0 + "SYS_exit", //1 + "SYS_fork", //2 + "SYS_read", //3 + "SYS_write", //4 + "SYS_open", //5 + "SYS_close", //6 + "SYS_wait4", //7 + "SYS_8", + "SYS_link", //9 + "SYS_unlink", //10 + "SYS_11", + "SYS_chdir", //12 + "SYS_fchdir", //13 + "SYS_mknod", //14 + "SYS_chmod", //15 + "SYS_chown", //16 + "SYS_break", //17 + "SYS_18", + "SYS_19", + "SYS_getpid", //20 + "SYS_mount", //21 + "SYS_unmount", //22 + "SYS_setuid", //23 + "SYS_getuid", //24 + "SYS_geteuid", //25 + "SYS_ptrace", //26 + "SYS_recvmsg", //27 + "SYS_sendmsg", //28 + "SYS_recvfrom", //29 + "SYS_accept", //30 + "SYS_getpeername", //31 + "SYS_getsockname", //32 + "SYS_access", //33 + "SYS_chflags", //34 + "SYS_fchflags", //35 + "SYS_sync", //36 + "SYS_kill", //37 + "SYS_38", + "SYS_getppid", //39 + "SYS_40", + "SYS_dup", //41 + "SYS_opipe", //42 + "SYS_getegid", //43 + "SYS_profil", //44 + "SYS_ktrace", //45 + "SYS_sigaction", //46 + "SYS_getgid", //47 + "SYS_sigprocmask", //48 + "SYS_getlogin", //49 + "SYS_setlogin", //50 + "SYS_acct", //51 + "SYS_sigpending", //52 + "SYS_osigaltstack", //53 + "SYS_ioctl", //54 + "SYS_reboot", //55 + "SYS_revoke", //56 + "SYS_symlink", //57 + "SYS_readlink", //58 + "SYS_execve", //59 + "SYS_umask", //60 + "SYS_chroot", //61 + "SYS_62", + "SYS_63", + "SYS_64", + "SYS_65", + "SYS_vfork", //66 + "SYS_67", + "SYS_68", + "SYS_sbrk", //69 + "SYS_sstk", //70 + "SYS_61", + "SYS_vadvise", //72 + "SYS_munmap", //73 + "SYS_mprotect", //74 + "SYS_madvise", //75 + "SYS_76", + "SYS_77", + "SYS_mincore", //78 + "SYS_getgroups", //79 + "SYS_setgroups", //80 + "SYS_getpgrp", //81 + "SYS_setpgid", //82 + "SYS_setitimer", //83 + "SYS_84", + "SYS_85", + "SYS_getitimer", //86 + "SYS_87", + "SYS_88", + "SYS_89", + "SYS_dup2", //90 + "SYS_91", + "SYS_fcntl", //92 + "SYS_select", //93 + "SYS_94", + "SYS_fsync", //95 + "SYS_setpriority", //96 + "SYS_socket", //97 + "SYS_connect", //98 + "SYS_99", + "SYS_getpriority", //100 + "SYS_101", + "SYS_102", + "SYS_sigreturn", //103 + "SYS_bind", //104 + "SYS_setsockopt", //105 + "SYS_listen", //106 + "SYS_107", + "SYS_108", + "SYS_109", + "SYS_110", + "SYS_sigsuspend", //111 + "SYS_112", + "SYS_113", + "SYS_114", + "SYS_115", + "SYS_gettimeofday", //116 + "SYS_getrusage", //117 + "SYS_getsockopt", //118 + "SYS_119", + "SYS_readv", //120 + "SYS_writev", //121 + "SYS_settimeofday", //122 + "SYS_fchown", //123 + "SYS_fchmod", //124 + "SYS_125", + "SYS_setreuid", //126 + "SYS_setregid", //127 + "SYS_rename", //128 + "SYS_129", + "SYS_130", + "SYS_flock", //131 + "SYS_mkfifo", //132 + "SYS_sendto", //133 + "SYS_shutdown", //134 + "SYS_socketpair", //135 + "SYS_mkdir", //136 + "SYS_rmdir", //137 + "SYS_utimes", //138 + "SYS_139", + "SYS_adjtime", //140 + "SYS_141", + "SYS_142", + "SYS_143", + "SYS_144", + "SYS_145", + "SYS_146", + "SYS_setsid", //147 + "SYS_quotactl", //148 + "SYS_149", + "SYS_150", + "SYS_151", + "SYS_152", + "SYS_153", + "SYS_154", + "SYS_nfssvc", //155 + "SYS_156", + "SYS_157", + "SYS_158", + "SYS_159", + "SYS_160", + "SYS_getfh", //161 + "SYS_162", + "SYS_163", + "SYS_164", + "SYS_sysarch", //165 + "SYS_166", + "SYS_167", + "SYS_168", + "SYS_169", + "SYS_170", + "SYS_171", + "SYS_172", + "SYS_pread", //173 + "SYS_pwrite", //174 + "SYS_175", + "SYS_176", + "SYS_177", + "SYS_178", + "SYS_179", + "SYS_180", + "SYS_setgid", //181 + "SYS_setegid", //182 + "SYS_seteuid", //183 + "SYS_lfs_bmapv", //184 + "SYS_lfs_markv", 
//185 + "SYS_lfs_segclean", //186 + "SYS_lfs_segwait", //187 + "SYS_188", + "SYS_189", + "SYS_190", + "SYS_pathconf", //191 + "SYS_fpathconf", //192 + "SYS_swapctl", //193 + "SYS_getrlimit", //194 + "SYS_setrlimit", //195 + "SYS_getdirentries", //196 + "SYS_mmap", //197 + "SYS___syscall", //198 + "SYS_lseek", //199 + "SYS_truncate", //200 + "SYS_ftruncate", //201 + "SYS___sysctl", //202 + "SYS_mlock", //203 + "SYS_munlock", //204 + "SYS_205", + "SYS_futimes", //206 + "SYS_getpgid", //207 + "SYS_xfspioctl", //208 + "SYS_209", + "SYS_210", + "SYS_211", + "SYS_212", + "SYS_213", + "SYS_214", + "SYS_215", + "SYS_216", + "SYS_217", + "SYS_218", + "SYS_219", + "SYS_220", + "SYS_semget", //221 + "SYS_222", + "SYS_223", + "SYS_224", + "SYS_msgget", //225 + "SYS_msgsnd", //226 + "SYS_msgrcv", //227 + "SYS_shmat", //228 + "SYS_229", + "SYS_shmdt", //230 + "SYS_231", + "SYS_clock_gettime", //232 + "SYS_clock_settime", //233 + "SYS_clock_getres", //234 + "SYS_235", + "SYS_236", + "SYS_237", + "SYS_238", + "SYS_239", + "SYS_nanosleep", //240 + "SYS_241", + "SYS_242", + "SYS_243", + "SYS_244", + "SYS_245", + "SYS_246", + "SYS_247", + "SYS_248", + "SYS_249", + "SYS_minherit", //250 + "SYS_rfork", //251 + "SYS_poll", //252 + "SYS_issetugid", //253 + "SYS_lchown", //254 + "SYS_getsid", //255 + "SYS_msync", //256 + "SYS_257", + "SYS_258", + "SYS_259", + "SYS_getfsstat", //260 + "SYS_statfs", //261 + "SYS_fstatfs", //262 + "SYS_pipe", //263 + "SYS_fhopen", //264 + "SYS_265", + "SYS_fhstatfs", //266 + "SYS_preadv", //267 + "SYS_pwritev", //268 + "SYS_kqueue", //269 + "SYS_kevent", //270 + "SYS_mlockall", //271 + "SYS_munlockall", //272 + "SYS_getpeereid", //273 + "SYS_274", + "SYS_275", + "SYS_276", + "SYS_277", + "SYS_278", + "SYS_279", + "SYS_280", + "SYS_getresuid", //281 + "SYS_setresuid", //282 + "SYS_getresgid", //283 + "SYS_setresgid", //284 + "SYS_285", + "SYS_mquery", //286 + "SYS_closefrom", //287 + "SYS_sigaltstack", //288 + "SYS_shmget", //289 + "SYS_semop", //290 + "SYS_stat", //291 + "SYS_fstat", //292 + "SYS_lstat", //293 + "SYS_fhstat", //294 + "SYS___semctl", //295 + "SYS_shmctl", //296 + "SYS_msgctl", //297 + "SYS_MAXSYSCALL", //298 + //299 + //300 + }; + uint32_t uEAX; + if (!LogIsEnabled()) + return; + uEAX = CPUMGetGuestEAX(pVCpu); + switch (uEAX) + { + default: + if (uEAX < RT_ELEMENTS(apsz)) + { + uint32_t au32Args[8] = {0}; + PGMPhysSimpleReadGCPtr(pVCpu, au32Args, CPUMGetGuestESP(pVCpu), sizeof(au32Args)); + RTLogPrintf("REM: OpenBSD syscall %3d: %s (eip=%08x %08x %08x %08x %08x %08x %08x %08x %08x)\n", + uEAX, apsz[uEAX], CPUMGetGuestEIP(pVCpu), au32Args[0], au32Args[1], au32Args[2], au32Args[3], + au32Args[4], au32Args[5], au32Args[6], au32Args[7]); + } + else + RTLogPrintf("eip=%08x: OpenBSD syscall %d (#%x) unknown!!\n", CPUMGetGuestEIP(pVCpu), uEAX, uEAX); + break; + } +} + + +#if defined(IPRT_NO_CRT) && defined(RT_OS_WINDOWS) && defined(RT_ARCH_X86) +/** + * The Dll main entry point (stub). 
+ */ +bool __stdcall _DllMainCRTStartup(void *hModule, uint32_t dwReason, void *pvReserved) +{ + return true; +} + +void *memcpy(void *dst, const void *src, size_t size) +{ + uint8_t*pbDst = dst, *pbSrc = src; + while (size-- > 0) + *pbDst++ = *pbSrc++; + return dst; +} + +#endif + +void cpu_smm_update(CPUX86State *env) +{ +} diff --git a/src/recompiler/bswap.h b/src/recompiler/bswap.h new file mode 100644 index 00000000..4377c7b0 --- /dev/null +++ b/src/recompiler/bswap.h @@ -0,0 +1,225 @@ +#ifndef BSWAP_H +#define BSWAP_H + +#include "config-host.h" + +#include <inttypes.h> + +#ifdef CONFIG_MACHINE_BSWAP_H +#include <sys/endian.h> +#include <sys/types.h> +#include <machine/bswap.h> +#else + +#ifdef CONFIG_BYTESWAP_H +#include <byteswap.h> +#else + +#define bswap_16(x) __extension__ /* <- VBOX */ \ +({ \ + uint16_t __x = (x); \ + ((uint16_t)( \ + (((uint16_t)(__x) & (uint16_t)0x00ffU) << 8) | \ + (((uint16_t)(__x) & (uint16_t)0xff00U) >> 8) )); \ +}) + +#define bswap_32(x) __extension__ /* <- VBOX */ \ +({ \ + uint32_t __x = (x); \ + ((uint32_t)( \ + (((uint32_t)(__x) & (uint32_t)0x000000ffUL) << 24) | \ + (((uint32_t)(__x) & (uint32_t)0x0000ff00UL) << 8) | \ + (((uint32_t)(__x) & (uint32_t)0x00ff0000UL) >> 8) | \ + (((uint32_t)(__x) & (uint32_t)0xff000000UL) >> 24) )); \ +}) + +#define bswap_64(x) __extension__ /* <- VBOX */ \ +({ \ + uint64_t __x = (x); \ + ((uint64_t)( \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000000000ffULL) << 56) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000000000ff00ULL) << 40) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000ff000000ULL) << 8) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000ff00000000ULL) >> 8) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0xff00000000000000ULL) >> 56) )); \ +}) + +#endif /* !CONFIG_BYTESWAP_H */ + +static inline uint16_t bswap16(uint16_t x) +{ + return bswap_16(x); +} + +static inline uint32_t bswap32(uint32_t x) +{ + return bswap_32(x); +} + +static inline uint64_t bswap64(uint64_t x) +{ + return bswap_64(x); +} + +#endif /* ! 
CONFIG_MACHINE_BSWAP_H */ + +static inline void bswap16s(uint16_t *s) +{ + *s = bswap16(*s); +} + +static inline void bswap32s(uint32_t *s) +{ + *s = bswap32(*s); +} + +static inline void bswap64s(uint64_t *s) +{ + *s = bswap64(*s); +} + +#if defined(HOST_WORDS_BIGENDIAN) +#define be_bswap(v, size) (v) +#define le_bswap(v, size) bswap ## size(v) +#define be_bswaps(v, size) +#define le_bswaps(p, size) *p = bswap ## size(*p); +#else +#define le_bswap(v, size) (v) +#define be_bswap(v, size) bswap ## size(v) +#define le_bswaps(v, size) +#define be_bswaps(p, size) *p = bswap ## size(*p); +#endif + +#define CPU_CONVERT(endian, size, type)\ +static inline type endian ## size ## _to_cpu(type v)\ +{\ + return endian ## _bswap(v, size);\ +}\ +\ +static inline type cpu_to_ ## endian ## size(type v)\ +{\ + return endian ## _bswap(v, size);\ +}\ +\ +static inline void endian ## size ## _to_cpus(type *p)\ +{\ + endian ## _bswaps(p, size)\ +}\ +\ +static inline void cpu_to_ ## endian ## size ## s(type *p)\ +{\ + endian ## _bswaps(p, size)\ +}\ +\ +static inline type endian ## size ## _to_cpup(const type *p)\ +{\ + return endian ## size ## _to_cpu(*p);\ +}\ +\ +static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\ +{\ + *p = cpu_to_ ## endian ## size(v);\ +} + +CPU_CONVERT(be, 16, uint16_t) +CPU_CONVERT(be, 32, uint32_t) +CPU_CONVERT(be, 64, uint64_t) + +CPU_CONVERT(le, 16, uint16_t) +CPU_CONVERT(le, 32, uint32_t) +CPU_CONVERT(le, 64, uint64_t) + +/* unaligned versions (optimized for frequent unaligned accesses)*/ + +#if defined(__i386__) || defined(_ARCH_PPC) + +#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v) +#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v) +#define le16_to_cpupu(p) le16_to_cpup(p) +#define le32_to_cpupu(p) le32_to_cpup(p) +#define be32_to_cpupu(p) be32_to_cpup(p) + +#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v) +#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v) + +#else + +static inline void cpu_to_le16wu(uint16_t *p, uint16_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v & 0xff; + p1[1] = v >> 8; +} + +static inline void cpu_to_le32wu(uint32_t *p, uint32_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v & 0xff; + p1[1] = v >> 8; + p1[2] = v >> 16; + p1[3] = v >> 24; +} + +static inline uint16_t le16_to_cpupu(const uint16_t *p) +{ + const uint8_t *p1 = (const uint8_t *)p; + return p1[0] | (p1[1] << 8); +} + +static inline uint32_t le32_to_cpupu(const uint32_t *p) +{ + const uint8_t *p1 = (const uint8_t *)p; + return p1[0] | (p1[1] << 8) | (p1[2] << 16) | (p1[3] << 24); +} + +static inline uint32_t be32_to_cpupu(const uint32_t *p) +{ + const uint8_t *p1 = (const uint8_t *)p; + return p1[3] | (p1[2] << 8) | (p1[1] << 16) | (p1[0] << 24); +} + +static inline void cpu_to_be16wu(uint16_t *p, uint16_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v >> 8; + p1[1] = v & 0xff; +} + +static inline void cpu_to_be32wu(uint32_t *p, uint32_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v >> 24; + p1[1] = v >> 16; + p1[2] = v >> 8; + p1[3] = v & 0xff; +} + +#endif + +#ifdef HOST_WORDS_BIGENDIAN +#define cpu_to_32wu cpu_to_be32wu +#define leul_to_cpu(v) glue(glue(le,HOST_LONG_BITS),_to_cpu)(v) +#else +#define cpu_to_32wu cpu_to_le32wu +#define leul_to_cpu(v) (v) +#endif + +#undef le_bswap +#undef be_bswap +#undef le_bswaps +#undef be_bswaps + +/* len must be one of 1, 2, 4 */ +static inline uint32_t qemu_bswap_len(uint32_t value, int len) +{ + return bswap32(value) >> (32 - 8 * len); +} + +#endif /* BSWAP_H */ diff --git a/src/recompiler/cache-utils.h b/src/recompiler/cache-utils.h 
new file mode 100644 index 00000000..5d5d44f4 --- /dev/null +++ b/src/recompiler/cache-utils.h @@ -0,0 +1,41 @@ +#ifndef QEMU_CACHE_UTILS_H +#define QEMU_CACHE_UTILS_H + +#if defined(_ARCH_PPC) +struct qemu_cache_conf { + unsigned long dcache_bsize; + unsigned long icache_bsize; +}; + +extern struct qemu_cache_conf qemu_cache_conf; + +extern void qemu_cache_utils_init(char **envp); + +/* mildly adjusted code from tcg-dyngen.c */ +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + unsigned long p, start1, stop1; + unsigned long dsize = qemu_cache_conf.dcache_bsize; + unsigned long isize = qemu_cache_conf.icache_bsize; + + start1 = start & ~(dsize - 1); + stop1 = (stop + dsize - 1) & ~(dsize - 1); + for (p = start1; p < stop1; p += dsize) { + asm volatile ("dcbst 0,%0" : : "r"(p) : "memory"); + } + asm volatile ("sync" : : : "memory"); + + start &= start & ~(isize - 1); + stop1 = (stop + isize - 1) & ~(isize - 1); + for (p = start1; p < stop1; p += isize) { + asm volatile ("icbi 0,%0" : : "r"(p) : "memory"); + } + asm volatile ("sync" : : : "memory"); + asm volatile ("isync" : : : "memory"); +} + +#else +#define qemu_cache_utils_init(envp) do { (void) (envp); } while (0) +#endif + +#endif /* QEMU_CACHE_UTILS_H */ diff --git a/src/recompiler/cpu-all.h b/src/recompiler/cpu-all.h new file mode 100644 index 00000000..128c672e --- /dev/null +++ b/src/recompiler/cpu-all.h @@ -0,0 +1,1184 @@ +/* + * defines common to all virtual CPUs + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#ifndef CPU_ALL_H +#define CPU_ALL_H + +#ifdef VBOX +# ifndef LOG_GROUP +# define LOG_GROUP LOG_GROUP_REM +# endif +# include <VBox/log.h> +# include <VBox/vmm/pgm.h> /* PGM_DYNAMIC_RAM_ALLOC */ +#endif /* VBOX */ +#include "qemu-common.h" +#include "cpu-common.h" + +/* some important defines: + * + * WORDS_ALIGNED : if defined, the host cpu can only make word aligned + * memory accesses. + * + * HOST_WORDS_BIGENDIAN : if defined, the host cpu is big endian and + * otherwise little endian. 
+ * + * (TARGET_WORDS_ALIGNED : same for target cpu (not supported yet)) + * + * TARGET_WORDS_BIGENDIAN : same for target cpu + */ + +#include "softfloat.h" + +#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) +#define BSWAP_NEEDED +#endif + +#ifdef BSWAP_NEEDED + +static inline uint16_t tswap16(uint16_t s) +{ + return bswap16(s); +} + +static inline uint32_t tswap32(uint32_t s) +{ + return bswap32(s); +} + +static inline uint64_t tswap64(uint64_t s) +{ + return bswap64(s); +} + +static inline void tswap16s(uint16_t *s) +{ + *s = bswap16(*s); +} + +static inline void tswap32s(uint32_t *s) +{ + *s = bswap32(*s); +} + +static inline void tswap64s(uint64_t *s) +{ + *s = bswap64(*s); +} + +#else + +static inline uint16_t tswap16(uint16_t s) +{ + return s; +} + +static inline uint32_t tswap32(uint32_t s) +{ + return s; +} + +static inline uint64_t tswap64(uint64_t s) +{ + return s; +} + +static inline void tswap16s(uint16_t *s) +{ +} + +static inline void tswap32s(uint32_t *s) +{ +} + +static inline void tswap64s(uint64_t *s) +{ +} + +#endif + +#if TARGET_LONG_SIZE == 4 +#define tswapl(s) tswap32(s) +#define tswapls(s) tswap32s((uint32_t *)(s)) +#define bswaptls(s) bswap32s(s) +#else +#define tswapl(s) tswap64(s) +#define tswapls(s) tswap64s((uint64_t *)(s)) +#define bswaptls(s) bswap64s(s) +#endif + +typedef union { + float32 f; + uint32_t l; +} CPU_FloatU; + +/* NOTE: arm FPA is horrible as double 32 bit words are stored in big + endian ! */ +typedef union { + float64 d; +#if defined(HOST_WORDS_BIGENDIAN) \ + || (defined(__arm__) && !defined(__VFP_FP__) && !defined(CONFIG_SOFTFLOAT)) + struct { + uint32_t upper; + uint32_t lower; + } l; +#else + struct { + uint32_t lower; + uint32_t upper; + } l; +#endif + uint64_t ll; +} CPU_DoubleU; + +#ifdef TARGET_SPARC +typedef union { + float128 q; +#if defined(HOST_WORDS_BIGENDIAN) \ + || (defined(__arm__) && !defined(__VFP_FP__) && !defined(CONFIG_SOFTFLOAT)) + struct { + uint32_t upmost; + uint32_t upper; + uint32_t lower; + uint32_t lowest; + } l; + struct { + uint64_t upper; + uint64_t lower; + } ll; +#else + struct { + uint32_t lowest; + uint32_t lower; + uint32_t upper; + uint32_t upmost; + } l; + struct { + uint64_t lower; + uint64_t upper; + } ll; +#endif +} CPU_QuadU; +#endif + +/* CPU memory access without any memory or io remapping */ + +/* + * the generic syntax for the memory accesses is: + * + * load: ld{type}{sign}{size}{endian}_{access_type}(ptr) + * + * store: st{type}{size}{endian}_{access_type}(ptr, val) + * + * type is: + * (empty): integer access + * f : float access + * + * sign is: + * (empty): for floats or 32 bit size + * u : unsigned + * s : signed + * + * size is: + * b: 8 bits + * w: 16 bits + * l: 32 bits + * q: 64 bits + * + * endian is: + * (empty): target cpu endianness or 8 bit access + * r : reversed target cpu endianness (not implemented yet) + * be : big endian (not implemented yet) + * le : little endian (not implemented yet) + * + * access_type is: + * raw : host memory access + * user : user mode access using soft MMU + * kernel : kernel mode access using soft MMU + */ + +#ifdef VBOX +void remAbort(int rc, const char *pszTip) __attribute__((__noreturn__)); + +void remR3PhysRead(RTGCPHYS SrcGCPhys, void *pvDst, unsigned cb); +RTCCUINTREG remR3PhysReadU8(RTGCPHYS SrcGCPhys); +RTCCINTREG remR3PhysReadS8(RTGCPHYS SrcGCPhys); +RTCCUINTREG remR3PhysReadU16(RTGCPHYS SrcGCPhys); +RTCCINTREG remR3PhysReadS16(RTGCPHYS SrcGCPhys); +RTCCUINTREG remR3PhysReadU32(RTGCPHYS SrcGCPhys); +RTCCINTREG 
remR3PhysReadS32(RTGCPHYS SrcGCPhys); +uint64_t remR3PhysReadU64(RTGCPHYS SrcGCPhys); +int64_t remR3PhysReadS64(RTGCPHYS SrcGCPhys); +void remR3PhysWrite(RTGCPHYS DstGCPhys, const void *pvSrc, unsigned cb); +void remR3PhysWriteU8(RTGCPHYS DstGCPhys, uint8_t val); +void remR3PhysWriteU16(RTGCPHYS DstGCPhys, uint16_t val); +void remR3PhysWriteU32(RTGCPHYS DstGCPhys, uint32_t val); +void remR3PhysWriteU64(RTGCPHYS DstGCPhys, uint64_t val); + +# ifndef REM_PHYS_ADDR_IN_TLB +void *remR3TlbGCPhys2Ptr(CPUState *env1, target_ulong physAddr, int fWritable); +# endif + +#endif /* VBOX */ + +#if defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) + +DECLINLINE(uint8_t) ldub_p(const void *ptr) +{ + VBOX_CHECK_ADDR(ptr); + return remR3PhysReadU8((uintptr_t)ptr); +} + +DECLINLINE(int8_t) ldsb_p(const void *ptr) +{ + VBOX_CHECK_ADDR(ptr); + return remR3PhysReadS8((uintptr_t)ptr); +} + +DECLINLINE(void) stb_p(void *ptr, int v) +{ + VBOX_CHECK_ADDR(ptr); + remR3PhysWriteU8((uintptr_t)ptr, v); +} + +DECLINLINE(uint32_t) lduw_le_p(const void *ptr) +{ + VBOX_CHECK_ADDR(ptr); + return remR3PhysReadU16((uintptr_t)ptr); +} + +DECLINLINE(int32_t) ldsw_le_p(const void *ptr) +{ + VBOX_CHECK_ADDR(ptr); + return remR3PhysReadS16((uintptr_t)ptr); +} + +DECLINLINE(void) stw_le_p(void *ptr, int v) +{ + VBOX_CHECK_ADDR(ptr); + remR3PhysWriteU16((uintptr_t)ptr, v); +} + +DECLINLINE(uint32_t) ldl_le_p(const void *ptr) +{ + VBOX_CHECK_ADDR(ptr); + return remR3PhysReadU32((uintptr_t)ptr); +} + +DECLINLINE(void) stl_le_p(void *ptr, int v) +{ + VBOX_CHECK_ADDR(ptr); + remR3PhysWriteU32((uintptr_t)ptr, v); +} + +DECLINLINE(void) stq_le_p(void *ptr, uint64_t v) +{ + VBOX_CHECK_ADDR(ptr); + remR3PhysWriteU64((uintptr_t)ptr, v); +} + +DECLINLINE(uint64_t) ldq_le_p(const void *ptr) +{ + VBOX_CHECK_ADDR(ptr); + return remR3PhysReadU64((uintptr_t)ptr); +} + +# undef VBOX_CHECK_ADDR + +/* float access */ + +DECLINLINE(float32) ldfl_le_p(const void *ptr) +{ + union { + float32 f; + uint32_t i; + } u; + u.i = ldl_le_p(ptr); + return u.f; +} + +DECLINLINE(void) stfl_le_p(void *ptr, float32 v) +{ + union { + float32 f; + uint32_t i; + } u; + u.f = v; + stl_le_p(ptr, u.i); +} + +DECLINLINE(float64) ldfq_le_p(const void *ptr) +{ + CPU_DoubleU u; + u.l.lower = ldl_le_p(ptr); + u.l.upper = ldl_le_p((uint8_t*)ptr + 4); + return u.d; +} + +DECLINLINE(void) stfq_le_p(void *ptr, float64 v) +{ + CPU_DoubleU u; + u.d = v; + stl_le_p(ptr, u.l.lower); + stl_le_p((uint8_t*)ptr + 4, u.l.upper); +} + +#else /* !VBOX || !REM_PHYS_ADDR_IN_TLB */ + +static inline int ldub_p(const void *ptr) +{ + return *(uint8_t *)ptr; +} + +static inline int ldsb_p(const void *ptr) +{ + return *(int8_t *)ptr; +} + +static inline void stb_p(void *ptr, int v) +{ + *(uint8_t *)ptr = v; +} + +/* NOTE: on arm, putting 2 in /proc/sys/debug/alignment so that the + kernel handles unaligned load/stores may give better results, but + it is a system wide setting : bad */ +#if defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED) + +/* conservative code for little endian unaligned accesses */ +static inline int lduw_le_p(const void *ptr) +{ +#ifdef _ARCH_PPC + int val; + __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr)); + return val; +#else + const uint8_t *p = ptr; + return p[0] | (p[1] << 8); +#endif +} + +static inline int ldsw_le_p(const void *ptr) +{ +#ifdef _ARCH_PPC + int val; + __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr)); + return (int16_t)val; +#else + const uint8_t *p = ptr; + return (int16_t)(p[0] | (p[1] << 8)); +#endif +} + +static inline 
int ldl_le_p(const void *ptr) +{ +#ifdef _ARCH_PPC + int val; + __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (val) : "r" (ptr)); + return val; +#else + const uint8_t *p = ptr; + return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); +#endif +} + +static inline uint64_t ldq_le_p(const void *ptr) +{ + const uint8_t *p = ptr; + uint32_t v1, v2; + v1 = ldl_le_p(p); + v2 = ldl_le_p(p + 4); + return v1 | ((uint64_t)v2 << 32); +} + +static inline void stw_le_p(void *ptr, int v) +{ +#ifdef _ARCH_PPC + __asm__ __volatile__ ("sthbrx %1,0,%2" : "=m" (*(uint16_t *)ptr) : "r" (v), "r" (ptr)); +#else + uint8_t *p = ptr; + p[0] = v; + p[1] = v >> 8; +#endif +} + +static inline void stl_le_p(void *ptr, int v) +{ +#ifdef _ARCH_PPC + __asm__ __volatile__ ("stwbrx %1,0,%2" : "=m" (*(uint32_t *)ptr) : "r" (v), "r" (ptr)); +#else + uint8_t *p = ptr; + p[0] = v; + p[1] = v >> 8; + p[2] = v >> 16; + p[3] = v >> 24; +#endif +} + +static inline void stq_le_p(void *ptr, uint64_t v) +{ + uint8_t *p = ptr; + stl_le_p(p, (uint32_t)v); + stl_le_p(p + 4, v >> 32); +} + +/* float access */ + +static inline float32 ldfl_le_p(const void *ptr) +{ + union { + float32 f; + uint32_t i; + } u; + u.i = ldl_le_p(ptr); + return u.f; +} + +static inline void stfl_le_p(void *ptr, float32 v) +{ + union { + float32 f; + uint32_t i; + } u; + u.f = v; + stl_le_p(ptr, u.i); +} + +static inline float64 ldfq_le_p(const void *ptr) +{ + CPU_DoubleU u; + u.l.lower = ldl_le_p(ptr); + u.l.upper = ldl_le_p(ptr + 4); + return u.d; +} + +static inline void stfq_le_p(void *ptr, float64 v) +{ + CPU_DoubleU u; + u.d = v; + stl_le_p(ptr, u.l.lower); + stl_le_p(ptr + 4, u.l.upper); +} + +#else + +static inline int lduw_le_p(const void *ptr) +{ + return *(uint16_t *)ptr; +} + +static inline int ldsw_le_p(const void *ptr) +{ + return *(int16_t *)ptr; +} + +static inline int ldl_le_p(const void *ptr) +{ + return *(uint32_t *)ptr; +} + +static inline uint64_t ldq_le_p(const void *ptr) +{ + return *(uint64_t *)ptr; +} + +static inline void stw_le_p(void *ptr, int v) +{ + *(uint16_t *)ptr = v; +} + +static inline void stl_le_p(void *ptr, int v) +{ + *(uint32_t *)ptr = v; +} + +static inline void stq_le_p(void *ptr, uint64_t v) +{ + *(uint64_t *)ptr = v; +} + +/* float access */ + +static inline float32 ldfl_le_p(const void *ptr) +{ + return *(float32 *)ptr; +} + +static inline float64 ldfq_le_p(const void *ptr) +{ + return *(float64 *)ptr; +} + +static inline void stfl_le_p(void *ptr, float32 v) +{ + *(float32 *)ptr = v; +} + +static inline void stfq_le_p(void *ptr, float64 v) +{ + *(float64 *)ptr = v; +} +#endif + +#endif /* !VBOX || !REM_PHYS_ADDR_IN_TLB */ + +#if !defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED) + +static inline int lduw_be_p(const void *ptr) +{ +#if defined(__i386__) + int val; + asm volatile ("movzwl %1, %0\n" + "xchgb %b0, %h0\n" + : "=q" (val) + : "m" (*(uint16_t *)ptr)); + return val; +#else + const uint8_t *b = ptr; + return ((b[0] << 8) | b[1]); +#endif +} + +static inline int ldsw_be_p(const void *ptr) +{ +#if defined(__i386__) + int val; + asm volatile ("movzwl %1, %0\n" + "xchgb %b0, %h0\n" + : "=q" (val) + : "m" (*(uint16_t *)ptr)); + return (int16_t)val; +#else + const uint8_t *b = ptr; + return (int16_t)((b[0] << 8) | b[1]); +#endif +} + +static inline int ldl_be_p(const void *ptr) +{ +#if defined(__i386__) || defined(__x86_64__) + int val; + asm volatile ("movl %1, %0\n" + "bswap %0\n" + : "=r" (val) + : "m" (*(uint32_t *)ptr)); + return val; +#else + const uint8_t *b = ptr; + return (b[0] << 24) | (b[1] << 16) | 
(b[2] << 8) | b[3]; +#endif +} + +static inline uint64_t ldq_be_p(const void *ptr) +{ + uint32_t a,b; + a = ldl_be_p(ptr); + b = ldl_be_p((uint8_t *)ptr + 4); + return (((uint64_t)a<<32)|b); +} + +static inline void stw_be_p(void *ptr, int v) +{ +#if defined(__i386__) + asm volatile ("xchgb %b0, %h0\n" + "movw %w0, %1\n" + : "=q" (v) + : "m" (*(uint16_t *)ptr), "0" (v)); +#else + uint8_t *d = (uint8_t *) ptr; + d[0] = v >> 8; + d[1] = v; +#endif +} + +static inline void stl_be_p(void *ptr, int v) +{ +#if defined(__i386__) || defined(__x86_64__) + asm volatile ("bswap %0\n" + "movl %0, %1\n" + : "=r" (v) + : "m" (*(uint32_t *)ptr), "0" (v)); +#else + uint8_t *d = (uint8_t *) ptr; + d[0] = v >> 24; + d[1] = v >> 16; + d[2] = v >> 8; + d[3] = v; +#endif +} + +static inline void stq_be_p(void *ptr, uint64_t v) +{ + stl_be_p(ptr, v >> 32); + stl_be_p((uint8_t *)ptr + 4, v); +} + +/* float access */ + +static inline float32 ldfl_be_p(const void *ptr) +{ + union { + float32 f; + uint32_t i; + } u; + u.i = ldl_be_p(ptr); + return u.f; +} + +static inline void stfl_be_p(void *ptr, float32 v) +{ + union { + float32 f; + uint32_t i; + } u; + u.f = v; + stl_be_p(ptr, u.i); +} + +static inline float64 ldfq_be_p(const void *ptr) +{ + CPU_DoubleU u; + u.l.upper = ldl_be_p(ptr); + u.l.lower = ldl_be_p((uint8_t *)ptr + 4); + return u.d; +} + +static inline void stfq_be_p(void *ptr, float64 v) +{ + CPU_DoubleU u; + u.d = v; + stl_be_p(ptr, u.l.upper); + stl_be_p((uint8_t *)ptr + 4, u.l.lower); +} + +#else + +static inline int lduw_be_p(const void *ptr) +{ + return *(uint16_t *)ptr; +} + +static inline int ldsw_be_p(const void *ptr) +{ + return *(int16_t *)ptr; +} + +static inline int ldl_be_p(const void *ptr) +{ + return *(uint32_t *)ptr; +} + +static inline uint64_t ldq_be_p(const void *ptr) +{ + return *(uint64_t *)ptr; +} + +static inline void stw_be_p(void *ptr, int v) +{ + *(uint16_t *)ptr = v; +} + +static inline void stl_be_p(void *ptr, int v) +{ + *(uint32_t *)ptr = v; +} + +static inline void stq_be_p(void *ptr, uint64_t v) +{ + *(uint64_t *)ptr = v; +} + +/* float access */ + +static inline float32 ldfl_be_p(const void *ptr) +{ + return *(float32 *)ptr; +} + +static inline float64 ldfq_be_p(const void *ptr) +{ + return *(float64 *)ptr; +} + +static inline void stfl_be_p(void *ptr, float32 v) +{ + *(float32 *)ptr = v; +} + +static inline void stfq_be_p(void *ptr, float64 v) +{ + *(float64 *)ptr = v; +} + +#endif + +/* target CPU memory access functions */ +#if defined(TARGET_WORDS_BIGENDIAN) +#define lduw_p(p) lduw_be_p(p) +#define ldsw_p(p) ldsw_be_p(p) +#define ldl_p(p) ldl_be_p(p) +#define ldq_p(p) ldq_be_p(p) +#define ldfl_p(p) ldfl_be_p(p) +#define ldfq_p(p) ldfq_be_p(p) +#define stw_p(p, v) stw_be_p(p, v) +#define stl_p(p, v) stl_be_p(p, v) +#define stq_p(p, v) stq_be_p(p, v) +#define stfl_p(p, v) stfl_be_p(p, v) +#define stfq_p(p, v) stfq_be_p(p, v) +#else +#define lduw_p(p) lduw_le_p(p) +#define ldsw_p(p) ldsw_le_p(p) +#define ldl_p(p) ldl_le_p(p) +#define ldq_p(p) ldq_le_p(p) +#define ldfl_p(p) ldfl_le_p(p) +#define ldfq_p(p) ldfq_le_p(p) +#define stw_p(p, v) stw_le_p(p, v) +#define stl_p(p, v) stl_le_p(p, v) +#define stq_p(p, v) stq_le_p(p, v) +#define stfl_p(p, v) stfl_le_p(p, v) +#define stfq_p(p, v) stfq_le_p(p, v) +#endif + +/* MMU memory access macros */ + +#if defined(CONFIG_USER_ONLY) +#include <assert.h> +#include "qemu-types.h" + +/* On some host systems the guest address space is reserved on the host. + * This allows the guest address space to be offset to a convenient location. 
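+ * For example, with guest_base = 0x10000 a guest address 0x1000 is accessed
+ * at host address 0x11000 through g2h(), while h2g() applies the inverse and
+ * asserts (via h2g_valid) that the result fits the target virtual address
+ * space.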
+ */ +#if defined(CONFIG_USE_GUEST_BASE) +extern uintptr_t guest_base; +extern int have_guest_base; +extern uintptr_t reserved_va; +#define GUEST_BASE guest_base +#define RESERVED_VA reserved_va +#else +#define GUEST_BASE 0ul +#define RESERVED_VA 0ul +#endif + +/* All direct uses of g2h and h2g need to go away for usermode softmmu. */ +#define g2h(x) ((void *)((uintptr_t)(x) + GUEST_BASE)) + +#if HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS +#define h2g_valid(x) 1 +#else +#define h2g_valid(x) ({ \ + uintptr_t __guest = (uintptr_t)(x) - GUEST_BASE; \ + __guest < (1ul << TARGET_VIRT_ADDR_SPACE_BITS); \ +}) +#endif + +#define h2g(x) ({ \ + uintptr_t __ret = (uintptr_t)(x) - GUEST_BASE; \ + /* Check if given address fits target address space */ \ + assert(h2g_valid(x)); \ + (abi_ulong)__ret; \ +}) + +#define saddr(x) g2h(x) +#define laddr(x) g2h(x) + +#else /* !CONFIG_USER_ONLY */ +/* NOTE: we use double casts if pointers and target_ulong have + different sizes */ +#define saddr(x) (uint8_t *)(intptr_t)(x) +#define laddr(x) (uint8_t *)(intptr_t)(x) +#endif + +#define ldub_raw(p) ldub_p(laddr((p))) +#define ldsb_raw(p) ldsb_p(laddr((p))) +#define lduw_raw(p) lduw_p(laddr((p))) +#define ldsw_raw(p) ldsw_p(laddr((p))) +#define ldl_raw(p) ldl_p(laddr((p))) +#define ldq_raw(p) ldq_p(laddr((p))) +#define ldfl_raw(p) ldfl_p(laddr((p))) +#define ldfq_raw(p) ldfq_p(laddr((p))) +#define stb_raw(p, v) stb_p(saddr((p)), v) +#define stw_raw(p, v) stw_p(saddr((p)), v) +#define stl_raw(p, v) stl_p(saddr((p)), v) +#define stq_raw(p, v) stq_p(saddr((p)), v) +#define stfl_raw(p, v) stfl_p(saddr((p)), v) +#define stfq_raw(p, v) stfq_p(saddr((p)), v) + + +#if defined(CONFIG_USER_ONLY) + +/* if user mode, no other memory access functions */ +#define ldub(p) ldub_raw(p) +#define ldsb(p) ldsb_raw(p) +#define lduw(p) lduw_raw(p) +#define ldsw(p) ldsw_raw(p) +#define ldl(p) ldl_raw(p) +#define ldq(p) ldq_raw(p) +#define ldfl(p) ldfl_raw(p) +#define ldfq(p) ldfq_raw(p) +#define stb(p, v) stb_raw(p, v) +#define stw(p, v) stw_raw(p, v) +#define stl(p, v) stl_raw(p, v) +#define stq(p, v) stq_raw(p, v) +#define stfl(p, v) stfl_raw(p, v) +#define stfq(p, v) stfq_raw(p, v) + +#define ldub_code(p) ldub_raw(p) +#define ldsb_code(p) ldsb_raw(p) +#define lduw_code(p) lduw_raw(p) +#define ldsw_code(p) ldsw_raw(p) +#define ldl_code(p) ldl_raw(p) +#define ldq_code(p) ldq_raw(p) + +#define ldub_kernel(p) ldub_raw(p) +#define ldsb_kernel(p) ldsb_raw(p) +#define lduw_kernel(p) lduw_raw(p) +#define ldsw_kernel(p) ldsw_raw(p) +#define ldl_kernel(p) ldl_raw(p) +#define ldq_kernel(p) ldq_raw(p) +#define ldfl_kernel(p) ldfl_raw(p) +#define ldfq_kernel(p) ldfq_raw(p) +#define stb_kernel(p, v) stb_raw(p, v) +#define stw_kernel(p, v) stw_raw(p, v) +#define stl_kernel(p, v) stl_raw(p, v) +#define stq_kernel(p, v) stq_raw(p, v) +#define stfl_kernel(p, v) stfl_raw(p, v) +#define stfq_kernel(p, vt) stfq_raw(p, v) + +#endif /* defined(CONFIG_USER_ONLY) */ + +/* page related stuff */ + +#define TARGET_PAGE_SIZE (1 << TARGET_PAGE_BITS) +#define TARGET_PAGE_MASK ~(TARGET_PAGE_SIZE - 1) +#define TARGET_PAGE_ALIGN(addr) (((addr) + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK) + +/* ??? These should be the larger of uintptr_t and target_ulong. 
*/ +extern size_t qemu_real_host_page_size; +extern size_t qemu_host_page_bits; +extern size_t qemu_host_page_size; +extern uintptr_t qemu_host_page_mask; + +#define HOST_PAGE_ALIGN(addr) (((addr) + qemu_host_page_size - 1) & qemu_host_page_mask) + +/* same as PROT_xxx */ +#define PAGE_READ 0x0001 +#define PAGE_WRITE 0x0002 +#define PAGE_EXEC 0x0004 +#define PAGE_BITS (PAGE_READ | PAGE_WRITE | PAGE_EXEC) +#define PAGE_VALID 0x0008 +/* original state of the write flag (used when tracking self-modifying + code */ +#define PAGE_WRITE_ORG 0x0010 +#if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY) +/* FIXME: Code that sets/uses this is broken and needs to go away. */ +#define PAGE_RESERVED 0x0020 +#endif + +#if defined(CONFIG_USER_ONLY) +void page_dump(FILE *f); + +typedef int (*walk_memory_regions_fn)(void *, abi_ulong, + abi_ulong, uintptr_t); +int walk_memory_regions(void *, walk_memory_regions_fn); + +int page_get_flags(target_ulong address); +void page_set_flags(target_ulong start, target_ulong end, int flags); +int page_check_range(target_ulong start, target_ulong len, int flags); +#endif + +CPUState *cpu_copy(CPUState *env); +CPUState *qemu_get_cpu(int cpu); + +void cpu_dump_state(CPUState *env, FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...), + int flags); +void cpu_dump_statistics (CPUState *env, FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...), + int flags); + +void QEMU_NORETURN cpu_abort(CPUState *env, const char *fmt, ...) +#ifndef VBOX + __attribute__ ((__format__ (__printf__, 2, 3))); +#else /* VBOX */ + ; +#endif /* VBOX */ +extern CPUState *first_cpu; +extern CPUState *cpu_single_env; + +#define CPU_INTERRUPT_HARD 0x02 /* hardware interrupt pending */ +#define CPU_INTERRUPT_EXITTB 0x04 /* exit the current TB (use for x86 a20 case) */ +#define CPU_INTERRUPT_TIMER 0x08 /* internal timer exception pending */ +#define CPU_INTERRUPT_FIQ 0x10 /* Fast interrupt pending. */ +#define CPU_INTERRUPT_HALT 0x20 /* CPU halt wanted */ +#define CPU_INTERRUPT_SMI 0x40 /* (x86 only) SMI interrupt pending */ +#define CPU_INTERRUPT_DEBUG 0x80 /* Debug event occured. */ +#define CPU_INTERRUPT_VIRQ 0x100 /* virtual interrupt pending. */ +#define CPU_INTERRUPT_NMI 0x200 /* NMI pending. */ +#define CPU_INTERRUPT_INIT 0x400 /* INIT pending. */ +#define CPU_INTERRUPT_SIPI 0x800 /* SIPI pending. */ +#define CPU_INTERRUPT_MCE 0x1000 /* (x86 only) MCE pending. */ + +#ifdef VBOX +/** Executes a single instruction. cpu_exec() will normally return EXCP_SINGLE_INSTR. */ +# define CPU_INTERRUPT_SINGLE_INSTR 0x01000000 +/** Executing a CPU_INTERRUPT_SINGLE_INSTR request, quit the cpu_loop. (for exceptions and suchlike) */ +# define CPU_INTERRUPT_SINGLE_INSTR_IN_FLIGHT 0x02000000 +/** VM execution was interrupted by VMR3Reset, VMR3Suspend or VMR3PowerOff. */ +# define CPU_INTERRUPT_RC 0x04000000 +/** Exit current TB to process an external request. */ +# define CPU_INTERRUPT_EXTERNAL_FLUSH_TLB 0x08000000 +/** Exit current TB to process an external request. */ +# define CPU_INTERRUPT_EXTERNAL_EXIT 0x10000000 +/** Exit current TB to process an external interrupt request. */ +# define CPU_INTERRUPT_EXTERNAL_HARD 0x20000000 +/** Exit current TB to process an external timer request. */ +# define CPU_INTERRUPT_EXTERNAL_TIMER 0x40000000 +/** Exit current TB to process an external DMA request. 
*/ +# define CPU_INTERRUPT_EXTERNAL_DMA 0x80000000 +#endif /* VBOX */ +void cpu_interrupt(CPUState *s, int mask); +void cpu_reset_interrupt(CPUState *env, int mask); + +void cpu_exit(CPUState *s); + +int qemu_cpu_has_work(CPUState *env); + +/* Breakpoint/watchpoint flags */ +#define BP_MEM_READ 0x01 +#define BP_MEM_WRITE 0x02 +#define BP_MEM_ACCESS (BP_MEM_READ | BP_MEM_WRITE) +#define BP_STOP_BEFORE_ACCESS 0x04 +#define BP_WATCHPOINT_HIT 0x08 +#define BP_GDB 0x10 +#define BP_CPU 0x20 + +int cpu_breakpoint_insert(CPUState *env, target_ulong pc, int flags, + CPUBreakpoint **breakpoint); +int cpu_breakpoint_remove(CPUState *env, target_ulong pc, int flags); +void cpu_breakpoint_remove_by_ref(CPUState *env, CPUBreakpoint *breakpoint); +void cpu_breakpoint_remove_all(CPUState *env, int mask); +int cpu_watchpoint_insert(CPUState *env, target_ulong addr, target_ulong len, + int flags, CPUWatchpoint **watchpoint); +int cpu_watchpoint_remove(CPUState *env, target_ulong addr, + target_ulong len, int flags); +void cpu_watchpoint_remove_by_ref(CPUState *env, CPUWatchpoint *watchpoint); +void cpu_watchpoint_remove_all(CPUState *env, int mask); + +#define SSTEP_ENABLE 0x1 /* Enable simulated HW single stepping */ +#define SSTEP_NOIRQ 0x2 /* Do not use IRQ while single stepping */ +#define SSTEP_NOTIMER 0x4 /* Do not Timers while single stepping */ + +void cpu_single_step(CPUState *env, int enabled); +void cpu_reset(CPUState *s); +int cpu_is_stopped(CPUState *env); +void run_on_cpu(CPUState *env, void (*func)(void *data), void *data); + +#define CPU_LOG_TB_OUT_ASM (1 << 0) +#define CPU_LOG_TB_IN_ASM (1 << 1) +#define CPU_LOG_TB_OP (1 << 2) +#define CPU_LOG_TB_OP_OPT (1 << 3) +#define CPU_LOG_INT (1 << 4) +#define CPU_LOG_EXEC (1 << 5) +#define CPU_LOG_PCALL (1 << 6) +#define CPU_LOG_IOPORT (1 << 7) +#define CPU_LOG_TB_CPU (1 << 8) +#define CPU_LOG_RESET (1 << 9) + +/* define log items */ +typedef struct CPULogItem { + int mask; + const char *name; + const char *help; +} CPULogItem; + +extern const CPULogItem cpu_log_items[]; + +void cpu_set_log(int log_flags); +void cpu_set_log_filename(const char *filename); +int cpu_str_to_log_mask(const char *str); + +#if !defined(CONFIG_USER_ONLY) + +/* Return the physical page corresponding to a virtual one. Use it + only for debugging because no protection checks are done. Return -1 + if no page found. */ +target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, target_ulong addr); + +/* memory API */ + +#ifndef VBOX +extern int phys_ram_fd; +extern ram_addr_t ram_size; +#endif /* !VBOX */ + +typedef struct RAMBlock { + uint8_t *host; + ram_addr_t offset; + ram_addr_t length; + char idstr[256]; + QLIST_ENTRY(RAMBlock) next; +#if defined(__linux__) && !defined(TARGET_S390X) + int fd; +#endif +} RAMBlock; + +typedef struct RAMList { + uint8_t *phys_dirty; +#ifdef VBOX + /** This is required for bounds checking the phys_ram_dirty accesses. + * We have memory ranges (the high PC-BIOS mapping) which causes some pages + * to fall outside the dirty map. 
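+ * The VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET / _RETV macros below compare
+ * (addr >> TARGET_PAGE_BITS) against this page count and make the dirty
+ * tracking helpers bail out early for such out-of-range pages.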
*/ + RTGCPHYS phys_dirty_size; +#if 1 +# define VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET(addr,rv) \ + do { \ + if (RT_UNLIKELY( ((addr) >> TARGET_PAGE_BITS) >= ram_list.phys_dirty_size)) { \ + Log(("%s: %RGp\n", __FUNCTION__, (RTGCPHYS)addr)); \ + return (rv); \ + } \ + } while (0) +# define VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RETV(addr) \ + do { \ + if (RT_UNLIKELY( ((addr) >> TARGET_PAGE_BITS) >= ram_list.phys_dirty_size)) { \ + Log(("%s: %RGp\n", __FUNCTION__, (RTGCPHYS)addr)); \ + return; \ + } \ + } while (0) +#else +# define VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET(addr,rv) \ + AssertMsgReturn(((addr) >> TARGET_PAGE_BITS) < ram_list.phys_dirty_size, ("%#RGp\n", (RTGCPHYS)(addr)), (rv)); +# define VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RETV(addr) \ + AssertMsgReturnVoid(((addr) >> TARGET_PAGE_BITS) < ram_list.phys_dirty_size, ("%#RGp\n", (RTGCPHYS)(addr))); +# endif +#else +# define VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET(addr,rv) do {} while() +# define VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RETV(addr) do {} while() +#endif /* VBOX */ + QLIST_HEAD(ram, RAMBlock) blocks; +} RAMList; +extern RAMList ram_list; + +extern const char *mem_path; +extern int mem_prealloc; + +/* physical memory access */ + +/* MMIO pages are identified by a combination of an IO device index and + 3 flags. The ROMD code stores the page ram offset in iotlb entry, + so only a limited number of ids are avaiable. */ + +#define IO_MEM_NB_ENTRIES (1 << (TARGET_PAGE_BITS - IO_MEM_SHIFT)) + +/* Flags stored in the low bits of the TLB virtual address. These are + defined so that fast path ram access is all zeros. */ +/* Zero if TLB entry is valid. */ +#define TLB_INVALID_MASK (1 << 3) +/* Set if TLB entry references a clean RAM page. The iotlb entry will + contain the page physical address. */ +#define TLB_NOTDIRTY (1 << 4) +/* Set if TLB entry is an IO callback. 
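/*
 * Illustrative sketch of why "fast path ram access is all zeros": a plain RAM
 * entry stores the page-aligned address with every flag bit clear, so a single
 * compare rejects both a wrong page and any entry needing the slow path
 * (invalid, not-dirty or MMIO).  Constants are stand-ins assuming 4 KiB pages.
 */
#include <stdint.h>

#define SK_PAGE_MASK  (~(uint32_t)0xfff)                   /* stand-in for TARGET_PAGE_MASK */
#define SK_TLB_FLAGS  ((1u << 3) | (1u << 4) | (1u << 5))  /* INVALID | NOTDIRTY | MMIO */

static int sk_tlb_hit_fast_path(uint32_t tlb_addr, uint32_t vaddr)
{
    /* tlb_addr is (page | flags); any set flag bit makes the compare fail. */
    return tlb_addr == (vaddr & SK_PAGE_MASK);
}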
*/ +#define TLB_MMIO (1 << 5) + +#define VGA_DIRTY_FLAG 0x01 +#define CODE_DIRTY_FLAG 0x02 +#define MIGRATION_DIRTY_FLAG 0x08 + +/* read dirty bit (return 0 or 1) */ +static inline int cpu_physical_memory_is_dirty(ram_addr_t addr) +{ + VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET(addr, 0); + return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] == 0xff; +} + +static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr) +{ + VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET(addr, 0xff); + return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS]; +} + +static inline int cpu_physical_memory_get_dirty(ram_addr_t addr, + int dirty_flags) +{ + VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET(addr, 0xff & dirty_flags); + return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags; +} + +static inline void cpu_physical_memory_set_dirty(ram_addr_t addr) +{ + VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RETV(addr); + ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] = 0xff; +} + +static inline int cpu_physical_memory_set_dirty_flags(ram_addr_t addr, + int dirty_flags) +{ + VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RET(addr, 0xff); + return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] |= dirty_flags; +} + +static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start, + int length, + int dirty_flags) +{ + int i, mask, len; + uint8_t *p; + + VBOX_RAMLIST_DIRTY_BOUNDS_CHECK_RETV(start); + len = length >> TARGET_PAGE_BITS; + mask = ~dirty_flags; + p = ram_list.phys_dirty + (start >> TARGET_PAGE_BITS); + for (i = 0; i < len; i++) { + p[i] &= mask; + } +} + +void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, + int dirty_flags); +void cpu_tlb_update_dirty(CPUState *env); + +int cpu_physical_memory_set_dirty_tracking(int enable); + +int cpu_physical_memory_get_dirty_tracking(void); + +int cpu_physical_sync_dirty_bitmap(target_phys_addr_t start_addr, + target_phys_addr_t end_addr); + +void dump_exec_info(FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...)); +#endif /* !CONFIG_USER_ONLY */ + +int cpu_memory_rw_debug(CPUState *env, target_ulong addr, + uint8_t *buf, int len, int is_write); + +void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, + uint64_t mcg_status, uint64_t addr, uint64_t misc); + +#ifdef VBOX +void tb_invalidate_virt(CPUState *env, uint32_t eip); +#endif /* VBOX */ + +#endif /* CPU_ALL_H */ diff --git a/src/recompiler/cpu-common.h b/src/recompiler/cpu-common.h new file mode 100644 index 00000000..b523d527 --- /dev/null +++ b/src/recompiler/cpu-common.h @@ -0,0 +1,135 @@ +#ifndef CPU_COMMON_H +#define CPU_COMMON_H 1 + +/* CPU interfaces that are target indpendent. 
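/*
 * Illustrative toy model of the per-page dirty byte handled by the inline
 * functions above: every page owns one byte in phys_dirty, each client (VGA
 * refresh, self-modifying-code tracking, migration) owns a bit in it, and
 * 0xff means "dirty for everybody".
 */
#include <stdint.h>

enum { SK_VGA = 0x01, SK_CODE = 0x02, SK_MIGRATION = 0x08 };

static uint8_t sk_phys_dirty[16];              /* one byte per (toy) page */

static void sk_page_written(unsigned page)     /* guest stored to the page */
{
    sk_phys_dirty[page] = 0xff;                /* every client must re-scan it */
}

static int sk_vga_needs_redraw(unsigned page)  /* consumer-side query */
{
    return (sk_phys_dirty[page] & SK_VGA) != 0;
}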
*/ + +#if defined(__arm__) || defined(__sparc__) || defined(__mips__) || defined(__hppa__) || defined(__ia64__) +#define WORDS_ALIGNED +#endif + +#ifdef TARGET_PHYS_ADDR_BITS +#include "targphys.h" +#endif + +#ifndef NEED_CPU_H +#include "poison.h" +#endif + +#include "bswap.h" +#include "qemu-queue.h" + +#if !defined(CONFIG_USER_ONLY) + +/* address in the RAM (different from a physical address) */ +typedef uintptr_t ram_addr_t; + +/* memory API */ + +typedef void CPUWriteMemoryFunc(void *opaque, target_phys_addr_t addr, uint32_t value); +typedef uint32_t CPUReadMemoryFunc(void *opaque, target_phys_addr_t addr); + +void cpu_register_physical_memory_offset(target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset, + ram_addr_t region_offset); +static inline void cpu_register_physical_memory(target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset) +{ + cpu_register_physical_memory_offset(start_addr, size, phys_offset, 0); +} + +ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr); +#ifndef VBOX +ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, + ram_addr_t size, void *host); +ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size); +void qemu_ram_free(ram_addr_t addr); +/* This should only be used for ram local to a device. */ +void *qemu_get_ram_ptr(ram_addr_t addr); +/* This should not be used by devices. */ +ram_addr_t qemu_ram_addr_from_host(void *ptr); +#endif /* !VBOX */ + +int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read, + CPUWriteMemoryFunc * const *mem_write, + void *opaque); +void cpu_unregister_io_memory(int table_address); + +void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, + int len, int is_write); +static inline void cpu_physical_memory_read(target_phys_addr_t addr, + uint8_t *buf, int len) +{ + cpu_physical_memory_rw(addr, buf, len, 0); +} +static inline void cpu_physical_memory_write(target_phys_addr_t addr, + const uint8_t *buf, int len) +{ + cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1); +} +void *cpu_physical_memory_map(target_phys_addr_t addr, + target_phys_addr_t *plen, + int is_write); +void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len, + int is_write, target_phys_addr_t access_len); +void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque)); +void cpu_unregister_map_client(void *cookie); + +struct CPUPhysMemoryClient; +typedef struct CPUPhysMemoryClient CPUPhysMemoryClient; +struct CPUPhysMemoryClient { + void (*set_memory)(struct CPUPhysMemoryClient *client, + target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset); + int (*sync_dirty_bitmap)(struct CPUPhysMemoryClient *client, + target_phys_addr_t start_addr, + target_phys_addr_t end_addr); + int (*migration_log)(struct CPUPhysMemoryClient *client, + int enable); + QLIST_ENTRY(CPUPhysMemoryClient) list; +}; + +void cpu_register_phys_memory_client(CPUPhysMemoryClient *); +void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *); + +/* Coalesced MMIO regions are areas where write operations can be reordered. + * This usually implies that write operations are side-effect free. This allows + * batching which can make a major impact on performance when using + * virtualization. 
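/*
 * Illustrative sketch of how a device typically wires an MMIO page into the
 * physical address space with the two calls above.  The three-entry callback
 * tables (8/16/32-bit accessors) and passing the cpu_register_io_memory()
 * return value straight to cpu_register_physical_memory() follow common QEMU
 * practice of this vintage; treat those details as assumptions.
 */
static uint32_t sk_mmio_read(void *opaque, target_phys_addr_t addr)
{
    return 0;                                    /* device register read */
}

static void sk_mmio_write(void *opaque, target_phys_addr_t addr, uint32_t val)
{
    /* device register write */
}

static CPUReadMemoryFunc * const sk_mmio_reads[3] = {
    sk_mmio_read, sk_mmio_read, sk_mmio_read     /* byte, word, long slots */
};
static CPUWriteMemoryFunc * const sk_mmio_writes[3] = {
    sk_mmio_write, sk_mmio_write, sk_mmio_write
};

static void sk_mmio_map(target_phys_addr_t base)
{
    int io_index = cpu_register_io_memory(sk_mmio_reads, sk_mmio_writes, NULL);
    cpu_register_physical_memory(base, TARGET_PAGE_SIZE, io_index);
}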
+ */ +void qemu_register_coalesced_mmio(target_phys_addr_t addr, ram_addr_t size); + +void qemu_unregister_coalesced_mmio(target_phys_addr_t addr, ram_addr_t size); + +void qemu_flush_coalesced_mmio_buffer(void); + +uint32_t ldub_phys(target_phys_addr_t addr); +uint32_t lduw_phys(target_phys_addr_t addr); +uint32_t ldl_phys(target_phys_addr_t addr); +uint64_t ldq_phys(target_phys_addr_t addr); +void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val); +void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val); +void stb_phys(target_phys_addr_t addr, uint32_t val); +void stw_phys(target_phys_addr_t addr, uint32_t val); +void stl_phys(target_phys_addr_t addr, uint32_t val); +void stq_phys(target_phys_addr_t addr, uint64_t val); + +void cpu_physical_memory_write_rom(target_phys_addr_t addr, + const uint8_t *buf, int len); + +#define IO_MEM_SHIFT 3 + +#define IO_MEM_RAM (0 << IO_MEM_SHIFT) /* hardcoded offset */ +#define IO_MEM_ROM (1 << IO_MEM_SHIFT) /* hardcoded offset */ +#define IO_MEM_UNASSIGNED (2 << IO_MEM_SHIFT) +#define IO_MEM_NOTDIRTY (3 << IO_MEM_SHIFT) + +/* Acts like a ROM when read and like a device when written. */ +#define IO_MEM_ROMD (1) +#define IO_MEM_SUBPAGE (2) + +#endif + +#endif /* !CPU_COMMON_H */ diff --git a/src/recompiler/cpu-defs.h b/src/recompiler/cpu-defs.h new file mode 100644 index 00000000..65d57063 --- /dev/null +++ b/src/recompiler/cpu-defs.h @@ -0,0 +1,235 @@ +/* + * common defines for all CPUs + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. 
+ */ + +#ifndef CPU_DEFS_H +#define CPU_DEFS_H + +#ifndef NEED_CPU_H +#error cpu.h included from common code +#endif + +#include "config.h" +#include <setjmp.h> +#include <inttypes.h> +#ifndef VBOX +#include <signal.h> +#else /* VBOX */ +# define sig_atomic_t int32_t +#endif /* VBOX */ +#include "osdep.h" +#include "qemu-queue.h" +#include "targphys.h" + +#ifndef TARGET_LONG_BITS +#error TARGET_LONG_BITS must be defined before including this header +#endif + +#define TARGET_LONG_SIZE (TARGET_LONG_BITS / 8) + +/* target_ulong is the type of a virtual address */ +#if TARGET_LONG_SIZE == 4 +typedef int32_t target_long; +typedef uint32_t target_ulong; +#define TARGET_FMT_lx "%08x" +#define TARGET_FMT_ld "%d" +#define TARGET_FMT_lu "%u" +#elif TARGET_LONG_SIZE == 8 +typedef int64_t target_long; +typedef uint64_t target_ulong; +#define TARGET_FMT_lx "%016" PRIx64 +#define TARGET_FMT_ld "%" PRId64 +#define TARGET_FMT_lu "%" PRIu64 +#else +#error TARGET_LONG_SIZE undefined +#endif + +#define HOST_LONG_SIZE (HOST_LONG_BITS / 8) + +#define EXCP_INTERRUPT 0x10000 /* async interruption */ +#define EXCP_HLT 0x10001 /* hlt instruction reached */ +#define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ +#define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ +#ifdef VBOX +# define EXCP_EXECUTE_RAW 0x11024 /**< execute raw mode. */ +# define EXCP_EXECUTE_HM 0x11025 /**< execute hardware accelerated raw mode. */ +# define EXCP_SINGLE_INSTR 0x11026 /**< executed single instruction. */ +# define EXCP_RC 0x11027 /**< a EM rc was raised (VMR3Reset/Suspend/PowerOff). */ +#endif /* VBOX */ + +#define TB_JMP_CACHE_BITS 12 +#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS) + +/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for + addresses on the same page. The top bits are the same. This allows + TLB invalidation to quickly clear a subset of the hash table. */ +#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2) +#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS) +#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1) +#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE) + +#if !defined(CONFIG_USER_ONLY) +#define CPU_TLB_BITS 8 +#define CPU_TLB_SIZE (1 << CPU_TLB_BITS) + +#if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32 +#define CPU_TLB_ENTRY_BITS 4 +#else +#define CPU_TLB_ENTRY_BITS 5 +#endif + +typedef struct CPUTLBEntry { + /* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address + bit TARGET_PAGE_BITS-1..4 : Nonzero for accesses that should not + go directly to ram. + bit 3 : indicates that the entry is invalid + bit 2..0 : zero + */ + target_ulong addr_read; + target_ulong addr_write; + target_ulong addr_code; + /* Addend to virtual address to get host address. IO accesses + use the corresponding iotlb value. */ + uintptr_t addend; + /* padding to get a power of two size */ + uint8_t dummy[(1 << CPU_TLB_ENTRY_BITS) - + (sizeof(target_ulong) * 3 + + ((-sizeof(target_ulong) * 3) & (sizeof(uintptr_t) - 1)) + + sizeof(uintptr_t))]; +} CPUTLBEntry; + +extern int CPUTLBEntry_wrong_size[sizeof(CPUTLBEntry) == (1 << CPU_TLB_ENTRY_BITS) ? 1 : -1]; + +#define CPU_COMMON_TLB \ + /* The meaning of the MMU modes is defined in the target code. 
*/ \ + CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE]; \ + target_phys_addr_t iotlb[NB_MMU_MODES][CPU_TLB_SIZE]; \ + target_ulong tlb_flush_addr; \ + target_ulong tlb_flush_mask; + +#else + +#define CPU_COMMON_TLB + +#endif + + +#ifdef HOST_WORDS_BIGENDIAN +typedef struct icount_decr_u16 { + uint16_t high; + uint16_t low; +} icount_decr_u16; +#else +typedef struct icount_decr_u16 { + uint16_t low; + uint16_t high; +} icount_decr_u16; +#endif + +struct kvm_run; +struct KVMState; +struct qemu_work_item; + +typedef struct CPUBreakpoint { + target_ulong pc; + int flags; /* BP_* */ + QTAILQ_ENTRY(CPUBreakpoint) entry; +} CPUBreakpoint; + +typedef struct CPUWatchpoint { + target_ulong vaddr; + target_ulong len_mask; + int flags; /* BP_* */ + QTAILQ_ENTRY(CPUWatchpoint) entry; +} CPUWatchpoint; + +#define CPU_TEMP_BUF_NLONGS 128 +#define CPU_COMMON \ + struct TranslationBlock *current_tb; /* currently executing TB */ \ + /* soft mmu support */ \ + /* in order to avoid passing too many arguments to the MMIO \ + helpers, we store some rarely used information in the CPU \ + context) */ \ + uintptr_t mem_io_pc; /* host pc at which the memory was \ + accessed */ \ + target_ulong mem_io_vaddr; /* target virtual addr at which the \ + memory was accessed */ \ + uint32_t halted; /* Nonzero if the CPU is in suspend state */ \ + uint32_t interrupt_request; \ + volatile sig_atomic_t exit_request; \ + CPU_COMMON_TLB \ + struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE]; \ + /* buffer for temporaries in the code generator */ \ + long temp_buf[CPU_TEMP_BUF_NLONGS]; \ + \ + int64_t icount_extra; /* Instructions until next timer event. */ \ + /* Number of cycles left, with interrupt flag in high bit. \ + This allows a single read-compare-cbranch-write sequence to test \ + for both decrementer underflow and exceptions. */ \ + union { \ + uint32_t u32; \ + icount_decr_u16 u16; \ + } icount_decr; \ + uint32_t can_do_io; /* nonzero if memory mapped IO is safe. */ \ + \ + /* from this point: preserved by CPU reset */ \ + /* ice debug support */ \ + QTAILQ_HEAD(breakpoints_head, CPUBreakpoint) breakpoints; \ + int singlestep_enabled; \ + \ + QTAILQ_HEAD(watchpoints_head, CPUWatchpoint) watchpoints; \ + CPUWatchpoint *watchpoint_hit; \ + \ + struct GDBRegisterState *gdb_regs; \ + \ + /* Core interrupt code */ \ + jmp_buf jmp_env; \ + int exception_index; \ + \ + CPUState *next_cpu; /* next CPU sharing TB cache */ \ + int cpu_index; /* CPU index (informative) */ \ + uint32_t host_tid; /* host thread ID */ \ + int numa_node; /* NUMA node this cpu is belonging to */ \ + int nr_cores; /* number of cores within this CPU package */ \ + int nr_threads;/* number of threads within this CPU */ \ + int running; /* Nonzero if cpu is currently running(usermode). 
*/ \ + /* user data */ \ + void *opaque; \ + \ + uint32_t created; \ + uint32_t stop; /* Stop request */ \ + uint32_t stopped; /* Artificially stopped */ \ + struct QemuThread *thread; \ + struct QemuCond *halt_cond; \ + struct qemu_work_item *queued_work_first, *queued_work_last; \ + const char *cpu_model_str; \ + struct KVMState *kvm_state; \ + struct kvm_run *kvm_run; \ + int kvm_fd; \ + int kvm_vcpu_dirty; + +#endif diff --git a/src/recompiler/cpu-exec.c b/src/recompiler/cpu-exec.c new file mode 100644 index 00000000..4151363c --- /dev/null +++ b/src/recompiler/cpu-exec.c @@ -0,0 +1,1455 @@ +/* + * i386 emulator main execution loop + * + * Copyright (c) 2003-2005 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#include "config.h" +#include "exec.h" +#include "disas.h" +#include "tcg.h" +#include "kvm.h" +#include "qemu-barrier.h" + +#if !defined(CONFIG_SOFTMMU) +#undef EAX +#undef ECX +#undef EDX +#undef EBX +#undef ESP +#undef EBP +#undef ESI +#undef EDI +#undef EIP +#include <signal.h> +#ifdef __linux__ +#include <sys/ucontext.h> +#endif +#endif + +#if defined(__sparc__) && !defined(CONFIG_SOLARIS) +// Work around ugly bugs in glibc that mangle global register contents +#undef env +#define env cpu_single_env +#endif + +int tb_invalidated_flag; + +//#define CONFIG_DEBUG_EXEC +//#define DEBUG_SIGNAL + +int qemu_cpu_has_work(CPUState *env) +{ + return cpu_has_work(env); +} + +void cpu_loop_exit(void) +{ + env->current_tb = NULL; + longjmp(env->jmp_env, 1); +} + +/* exit the current TB from a signal handler. The host registers are + restored in a state compatible with the CPU emulator + */ +void cpu_resume_from_signal(CPUState *env1, void *puc) +{ +#if !defined(CONFIG_SOFTMMU) +#ifdef __linux__ + struct ucontext *uc = puc; +#elif defined(__OpenBSD__) + struct sigcontext *uc = puc; +#endif +#endif + + env = env1; + + /* XXX: restore cpu registers saved in host registers */ + +#if !defined(CONFIG_SOFTMMU) + if (puc) { + /* XXX: use siglongjmp ? */ +#ifdef __linux__ +#ifdef __ia64 + sigprocmask(SIG_SETMASK, (sigset_t *)&uc->uc_sigmask, NULL); +#else + sigprocmask(SIG_SETMASK, &uc->uc_sigmask, NULL); +#endif +#elif defined(__OpenBSD__) + sigprocmask(SIG_SETMASK, &uc->sc_mask, NULL); +#endif + } +#endif + env->exception_index = -1; + longjmp(env->jmp_env, 1); +} + +/* Execute the code without caching the generated code. 
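/*
 * Minimal standalone sketch of the setjmp/longjmp control flow behind
 * cpu_loop_exit() above: the outer execution loop arms a jump buffer, and any
 * exception or exit request unwinds straight back to it.  Names here are
 * illustrative, not the real CPUState fields.
 */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf sk_jmp_env;

static void sk_loop_exit(void)            /* plays the role of cpu_loop_exit() */
{
    longjmp(sk_jmp_env, 1);               /* unwind back to the setjmp() below */
}

int main(void)
{
    if (setjmp(sk_jmp_env) == 0) {        /* normal path: run "translated code" */
        sk_loop_exit();                   /* an exception bails out of it */
        puts("never reached");
    } else {
        puts("back in the outer loop; handle the exception here");
    }
    return 0;
}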
An interpreter + could be used if available. */ +static void cpu_exec_nocache(int max_cycles, TranslationBlock *orig_tb) +{ + uintptr_t next_tb; + TranslationBlock *tb; + + /* Should never happen. + We only end up here when an existing TB is too long. */ + if (max_cycles > CF_COUNT_MASK) + max_cycles = CF_COUNT_MASK; + + tb = tb_gen_code(env, orig_tb->pc, orig_tb->cs_base, orig_tb->flags, + max_cycles); + env->current_tb = tb; + /* execute the generated code */ +#if defined(VBOX) && defined(GCC_WITH_BUGGY_REGPARM) + tcg_qemu_tb_exec(tb->tc_ptr, next_tb); +#else + next_tb = tcg_qemu_tb_exec(tb->tc_ptr); +#endif + env->current_tb = NULL; + + if ((next_tb & 3) == 2) { + /* Restore PC. This may happen if async event occurs before + the TB starts executing. */ + cpu_pc_from_tb(env, tb); + } + tb_phys_invalidate(tb, -1); + tb_free(tb); +} + +static TranslationBlock *tb_find_slow(target_ulong pc, + target_ulong cs_base, + uint64_t flags) +{ + TranslationBlock *tb, **ptb1; + unsigned int h; + tb_page_addr_t phys_pc, phys_page1, phys_page2; + target_ulong virt_page2; + + tb_invalidated_flag = 0; + + /* find translated block using physical mappings */ + phys_pc = get_page_addr_code(env, pc); + phys_page1 = phys_pc & TARGET_PAGE_MASK; + phys_page2 = -1; + h = tb_phys_hash_func(phys_pc); + ptb1 = &tb_phys_hash[h]; + for(;;) { + tb = *ptb1; + if (!tb) + goto not_found; + if (tb->pc == pc && + tb->page_addr[0] == phys_page1 && + tb->cs_base == cs_base && + tb->flags == flags) { + /* check next page if needed */ + if (tb->page_addr[1] != -1) { + virt_page2 = (pc & TARGET_PAGE_MASK) + + TARGET_PAGE_SIZE; + phys_page2 = get_page_addr_code(env, virt_page2); + if (tb->page_addr[1] == phys_page2) + goto found; + } else { + goto found; + } + } + ptb1 = &tb->phys_hash_next; + } + not_found: + /* if no translated code available, then translate it now */ + tb = tb_gen_code(env, pc, cs_base, flags, 0); + + found: + /* we add the TB in the virtual pc hash table */ + env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb; + return tb; +} + +static inline TranslationBlock *tb_find_fast(void) +{ + TranslationBlock *tb; + target_ulong cs_base, pc; + int flags; + + /* we record a subset of the CPU state. It will + always be the same before a given translated block + is executed. 
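/*
 * Illustrative sketch of the two-level lookup implemented by tb_find_fast()
 * and tb_find_slow() above: a small direct-mapped cache indexed by a hash of
 * the PC, refilled from a slower hash-table/translation path on a miss.
 * Types and the hash are simplified stand-ins.
 */
#include <stdint.h>
#include <stddef.h>

#define SK_JMP_CACHE_SIZE (1 << 12)

struct sk_tb { uint32_t pc; /* ... pointer to generated code ... */ };

static struct sk_tb *sk_jmp_cache[SK_JMP_CACHE_SIZE];

static struct sk_tb *sk_find_slow(uint32_t pc)
{
    static struct sk_tb tb;                       /* toy: pretend we translated it */
    tb.pc = pc;
    return &tb;
}

static struct sk_tb *sk_find_fast(uint32_t pc)
{
    unsigned h = pc & (SK_JMP_CACHE_SIZE - 1);    /* stand-in for tb_jmp_cache_hash_func */
    struct sk_tb *tb = sk_jmp_cache[h];

    if (tb == NULL || tb->pc != pc) {             /* miss or stale entry */
        tb = sk_find_slow(pc);
        sk_jmp_cache[h] = tb;                     /* refill the fast cache */
    }
    return tb;
}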
*/ + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); + tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]; + if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base || + tb->flags != flags)) { + tb = tb_find_slow(pc, cs_base, flags); + } + return tb; +} + +static CPUDebugExcpHandler *debug_excp_handler; + +CPUDebugExcpHandler *cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler) +{ + CPUDebugExcpHandler *old_handler = debug_excp_handler; + + debug_excp_handler = handler; + return old_handler; +} + +static void cpu_handle_debug_exception(CPUState *env) +{ + CPUWatchpoint *wp; + + if (!env->watchpoint_hit) + QTAILQ_FOREACH(wp, &env->watchpoints, entry) + wp->flags &= ~BP_WATCHPOINT_HIT; + + if (debug_excp_handler) + debug_excp_handler(env); +} + +/* main execution loop */ + +volatile sig_atomic_t exit_request; + +int cpu_exec(CPUState *env1) +{ + volatile host_reg_t saved_env_reg; + int ret VBOX_ONLY(= 0), interrupt_request; + TranslationBlock *tb; + uint8_t *tc_ptr; + uintptr_t next_tb; + +# ifndef VBOX + if (cpu_halted(env1) == EXCP_HALTED) + return EXCP_HALTED; +# endif /* !VBOX */ + + cpu_single_env = env1; + + /* the access to env below is actually saving the global register's + value, so that files not including target-xyz/exec.h are free to + use it. */ + QEMU_BUILD_BUG_ON (sizeof (saved_env_reg) != sizeof (env)); + saved_env_reg = (host_reg_t) env; + barrier(); + env = env1; + + if (unlikely(exit_request)) { + env->exit_request = 1; + } + +#if defined(TARGET_I386) + if (!kvm_enabled()) { + /* put eflags in CPU temporary format */ + CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + DF = 1 - (2 * ((env->eflags >> 10) & 1)); + CC_OP = CC_OP_EFLAGS; + env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + } +#elif defined(TARGET_SPARC) +#elif defined(TARGET_M68K) + env->cc_op = CC_OP_FLAGS; + env->cc_dest = env->sr & 0xf; + env->cc_x = (env->sr >> 4) & 1; +#elif defined(TARGET_ALPHA) +#elif defined(TARGET_ARM) +#elif defined(TARGET_PPC) +#elif defined(TARGET_MICROBLAZE) +#elif defined(TARGET_MIPS) +#elif defined(TARGET_SH4) +#elif defined(TARGET_CRIS) +#elif defined(TARGET_S390X) + /* XXXXX */ +#else +#error unsupported target CPU +#endif +#ifndef VBOX /* VBOX: We need to raise traps and suchlike from the outside. */ + env->exception_index = -1; +#endif /* !VBOX */ + + /* prepare setjmp context for exception handling */ + for(;;) { + if (setjmp(env->jmp_env) == 0) { +#if defined(__sparc__) && !defined(CONFIG_SOLARIS) +#undef env + env = cpu_single_env; +#define env cpu_single_env +#endif +#ifdef VBOX + env->current_tb = NULL; /* probably not needed, but whatever... 
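/*
 * Illustrative sketch of the DF conversion above: the x86 direction bit
 * (EFLAGS bit 10) is turned into a step of +1 or -1 for string instructions.
 */
#include <stdint.h>

static int sk_df_step(uint32_t eflags)
{
    return 1 - 2 * (int)((eflags >> 10) & 1);   /* DF=0 -> +1, DF=1 -> -1 */
}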
*/ + + /* + * Check for fatal errors first + */ + if (env->interrupt_request & CPU_INTERRUPT_RC) { + env->exception_index = EXCP_RC; + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~CPU_INTERRUPT_RC); + ret = env->exception_index; + cpu_loop_exit(); + } +#endif + + /* if an exception is pending, we execute it here */ + if (env->exception_index >= 0) { + if (env->exception_index >= EXCP_INTERRUPT) { + /* exit request from the cpu execution loop */ + ret = env->exception_index; +#ifdef VBOX /* because of the above stuff */ + env->exception_index = -1; +#endif + if (ret == EXCP_DEBUG) + cpu_handle_debug_exception(env); + break; + } else { +#if defined(CONFIG_USER_ONLY) + /* if user mode only, we simulate a fake exception + which will be handled outside the cpu execution + loop */ +#if defined(TARGET_I386) + do_interrupt_user(env->exception_index, + env->exception_is_int, + env->error_code, + env->exception_next_eip); + /* successfully delivered */ + env->old_exception = -1; +#endif + ret = env->exception_index; + break; +#else +#if defined(TARGET_I386) + /* simulate a real cpu exception. On i386, it can + trigger new exceptions, but we do not handle + double or triple faults yet. */ +# ifdef VBOX + RAWEx_ProfileStart(env, STATS_IRQ_HANDLING); + Log(("do_interrupt: vec=%#x int=%d pc=%04x:%RGv\n", env->exception_index, env->exception_is_int, + env->segs[R_CS].selector, (RTGCPTR)env->exception_next_eip)); +# endif /* VBOX */ + do_interrupt(env->exception_index, + env->exception_is_int && env->exception_is_int != EXCEPTION_IS_INT_VALUE_HARDWARE_IRQ, + env->error_code, + env->exception_next_eip, + env->exception_is_int == EXCEPTION_IS_INT_VALUE_HARDWARE_IRQ); + /* successfully delivered */ + env->old_exception = -1; +# ifdef VBOX + RAWEx_ProfileStop(env, STATS_IRQ_HANDLING); +# endif /* VBOX */ +#elif defined(TARGET_PPC) + do_interrupt(env); +#elif defined(TARGET_MICROBLAZE) + do_interrupt(env); +#elif defined(TARGET_MIPS) + do_interrupt(env); +#elif defined(TARGET_SPARC) + do_interrupt(env); +#elif defined(TARGET_ARM) + do_interrupt(env); +#elif defined(TARGET_SH4) + do_interrupt(env); +#elif defined(TARGET_ALPHA) + do_interrupt(env); +#elif defined(TARGET_CRIS) + do_interrupt(env); +#elif defined(TARGET_M68K) + do_interrupt(0); +#endif + env->exception_index = -1; +#endif + } + } + +# ifndef VBOX + if (kvm_enabled()) { + kvm_cpu_exec(env); + longjmp(env->jmp_env, 1); + } +# endif /* !VBOX */ + + next_tb = 0; /* force lookup of first TB */ + for(;;) { + interrupt_request = env->interrupt_request; + if (unlikely(interrupt_request)) { + if (unlikely(env->singlestep_enabled & SSTEP_NOIRQ)) { + /* Mask out external interrupts for this step. */ + interrupt_request &= ~(CPU_INTERRUPT_HARD | + CPU_INTERRUPT_FIQ | + CPU_INTERRUPT_SMI | + CPU_INTERRUPT_NMI); + } + if (interrupt_request & CPU_INTERRUPT_DEBUG) { + env->interrupt_request &= ~CPU_INTERRUPT_DEBUG; + env->exception_index = EXCP_DEBUG; + cpu_loop_exit(); + } +#if defined(TARGET_ARM) || defined(TARGET_SPARC) || defined(TARGET_MIPS) || \ + defined(TARGET_PPC) || defined(TARGET_ALPHA) || defined(TARGET_CRIS) || \ + defined(TARGET_MICROBLAZE) + if (interrupt_request & CPU_INTERRUPT_HALT) { + env->interrupt_request &= ~CPU_INTERRUPT_HALT; + env->halted = 1; + env->exception_index = EXCP_HLT; + cpu_loop_exit(); + } +#endif +#if defined(TARGET_I386) +# ifdef VBOX + /* Memory registration may post a tlb flush request, process it ASAP. 
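/*
 * Illustrative sketch of the "test, then atomically clear" pattern used above
 * for interrupt_request bits.  ASMAtomicAndS32() is an IPRT primitive; the
 * GCC/Clang __atomic builtin below is only a portable stand-in with the same
 * effect.
 */
#include <stdint.h>

static int sk_take_request(volatile int32_t *interrupt_request, int32_t bit)
{
    if (*interrupt_request & bit) {
        /* Clear only this bit; other threads may be setting different ones. */
        __atomic_fetch_and(interrupt_request, ~bit, __ATOMIC_SEQ_CST);
        return 1;               /* the request was pending and is now consumed */
    }
    return 0;
}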
*/ + if (interrupt_request & (CPU_INTERRUPT_EXTERNAL_FLUSH_TLB)) { + tlb_flush(env, true); /* (clears the flush flag) */ + } + + /* Single instruction exec request, we execute it and return (one way or the other). + The caller will always reschedule after doing this operation! */ + if (interrupt_request & CPU_INTERRUPT_SINGLE_INSTR) + { + /* not in flight are we? (if we are, we trapped) */ + if (!(env->interrupt_request & CPU_INTERRUPT_SINGLE_INSTR_IN_FLIGHT)) + { + ASMAtomicOrS32((int32_t volatile *)&env->interrupt_request, CPU_INTERRUPT_SINGLE_INSTR_IN_FLIGHT); + env->exception_index = EXCP_SINGLE_INSTR; + if (emulate_single_instr(env) == -1) + AssertMsgFailed(("REM: emulate_single_instr failed for EIP=%RGv!!\n", (RTGCPTR)env->eip)); + + /* When we receive an external interrupt during execution of this single + instruction, then we should stay here. We will leave when we're ready + for raw-mode or when interrupted by pending EMT requests. */ + interrupt_request = env->interrupt_request; /* reload this! */ + if ( !(interrupt_request & CPU_INTERRUPT_HARD) + || !(env->eflags & IF_MASK) + || (env->hflags & HF_INHIBIT_IRQ_MASK) + || (env->state & CPU_RAW_HM) + ) + { + env->exception_index = ret = EXCP_SINGLE_INSTR; + cpu_loop_exit(); + } + } + /* Clear CPU_INTERRUPT_SINGLE_INSTR and leave CPU_INTERRUPT_SINGLE_INSTR_IN_FLIGHT set. */ + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~CPU_INTERRUPT_SINGLE_INSTR); + } +# endif /* VBOX */ + +# ifndef VBOX /** @todo reconcile our code with the following... */ + if (interrupt_request & CPU_INTERRUPT_INIT) { + svm_check_intercept(SVM_EXIT_INIT); + do_cpu_init(env); + env->exception_index = EXCP_HALTED; + cpu_loop_exit(); + } else if (interrupt_request & CPU_INTERRUPT_SIPI) { + do_cpu_sipi(env); + } else if (env->hflags2 & HF2_GIF_MASK) { + if ((interrupt_request & CPU_INTERRUPT_SMI) && + !(env->hflags & HF_SMM_MASK)) { + svm_check_intercept(SVM_EXIT_SMI); + env->interrupt_request &= ~CPU_INTERRUPT_SMI; + do_smm_enter(); + next_tb = 0; + } else if ((interrupt_request & CPU_INTERRUPT_NMI) && + !(env->hflags2 & HF2_NMI_MASK)) { + env->interrupt_request &= ~CPU_INTERRUPT_NMI; + env->hflags2 |= HF2_NMI_MASK; + do_interrupt(EXCP02_NMI, 0, 0, 0, 1); + next_tb = 0; + } else if (interrupt_request & CPU_INTERRUPT_MCE) { + env->interrupt_request &= ~CPU_INTERRUPT_MCE; + do_interrupt(EXCP12_MCHK, 0, 0, 0, 0); + next_tb = 0; + } else if ((interrupt_request & CPU_INTERRUPT_HARD) && + (((env->hflags2 & HF2_VINTR_MASK) && + (env->hflags2 & HF2_HIF_MASK)) || + (!(env->hflags2 & HF2_VINTR_MASK) && + (env->eflags & IF_MASK && + !(env->hflags & HF_INHIBIT_IRQ_MASK))))) { + int intno; + svm_check_intercept(SVM_EXIT_INTR); + env->interrupt_request &= ~(CPU_INTERRUPT_HARD | CPU_INTERRUPT_VIRQ); + intno = cpu_get_pic_interrupt(env); + qemu_log_mask(CPU_LOG_TB_IN_ASM, "Servicing hardware INT=0x%02x\n", intno); +#if defined(__sparc__) && !defined(CONFIG_SOLARIS) +#undef env + env = cpu_single_env; +#define env cpu_single_env +#endif + do_interrupt(intno, 0, 0, 0, 1); + /* ensure that no TB jump will be modified as + the program flow was changed */ + next_tb = 0; +#if !defined(CONFIG_USER_ONLY) + } else if ((interrupt_request & CPU_INTERRUPT_VIRQ) && + (env->eflags & IF_MASK) && + !(env->hflags & HF_INHIBIT_IRQ_MASK)) { + int intno; + /* FIXME: this should respect TPR */ + svm_check_intercept(SVM_EXIT_VINTR); + intno = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.int_vector)); + qemu_log_mask(CPU_LOG_TB_IN_ASM, "Servicing virtual hardware INT=0x%02x\n", 
intno); + do_interrupt(intno, 0, 0, 0, 1); + env->interrupt_request &= ~CPU_INTERRUPT_VIRQ; + next_tb = 0; +#endif + } + } +# else /* VBOX */ + RAWEx_ProfileStart(env, STATS_IRQ_HANDLING); + if ((interrupt_request & CPU_INTERRUPT_SMI) && + !(env->hflags & HF_SMM_MASK)) { + env->interrupt_request &= ~CPU_INTERRUPT_SMI; + do_smm_enter(); + next_tb = 0; + } + else if ((interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK) && + !(env->hflags & HF_INHIBIT_IRQ_MASK)) + { + /* if hardware interrupt pending, we execute it */ + int intno; + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~CPU_INTERRUPT_HARD); + intno = cpu_get_pic_interrupt(env); + if (intno >= 0) + { + Log(("do_interrupt %d\n", intno)); + do_interrupt(intno, 0, 0, 0, 1); + } + /* ensure that no TB jump will be modified as + the program flow was changed */ + next_tb = 0; + } +# endif /* VBOX */ +#elif defined(TARGET_PPC) +#if 0 + if ((interrupt_request & CPU_INTERRUPT_RESET)) { + cpu_reset(env); + } +#endif + if (interrupt_request & CPU_INTERRUPT_HARD) { + ppc_hw_interrupt(env); + if (env->pending_interrupts == 0) + env->interrupt_request &= ~CPU_INTERRUPT_HARD; + next_tb = 0; + } +#elif defined(TARGET_MICROBLAZE) + if ((interrupt_request & CPU_INTERRUPT_HARD) + && (env->sregs[SR_MSR] & MSR_IE) + && !(env->sregs[SR_MSR] & (MSR_EIP | MSR_BIP)) + && !(env->iflags & (D_FLAG | IMM_FLAG))) { + env->exception_index = EXCP_IRQ; + do_interrupt(env); + next_tb = 0; + } +#elif defined(TARGET_MIPS) + if ((interrupt_request & CPU_INTERRUPT_HARD) && + (env->CP0_Status & env->CP0_Cause & CP0Ca_IP_mask) && + (env->CP0_Status & (1 << CP0St_IE)) && + !(env->CP0_Status & (1 << CP0St_EXL)) && + !(env->CP0_Status & (1 << CP0St_ERL)) && + !(env->hflags & MIPS_HFLAG_DM)) { + /* Raise it */ + env->exception_index = EXCP_EXT_INTERRUPT; + env->error_code = 0; + do_interrupt(env); + next_tb = 0; + } +#elif defined(TARGET_SPARC) + if (interrupt_request & CPU_INTERRUPT_HARD) { + if (cpu_interrupts_enabled(env) && + env->interrupt_index > 0) { + int pil = env->interrupt_index & 0xf; + int type = env->interrupt_index & 0xf0; + + if (((type == TT_EXTINT) && + cpu_pil_allowed(env, pil)) || + type != TT_EXTINT) { + env->exception_index = env->interrupt_index; + do_interrupt(env); + next_tb = 0; + } + } + } else if (interrupt_request & CPU_INTERRUPT_TIMER) { + //do_interrupt(0, 0, 0, 0, 0); + env->interrupt_request &= ~CPU_INTERRUPT_TIMER; + } +#elif defined(TARGET_ARM) + if (interrupt_request & CPU_INTERRUPT_FIQ + && !(env->uncached_cpsr & CPSR_F)) { + env->exception_index = EXCP_FIQ; + do_interrupt(env); + next_tb = 0; + } + /* ARMv7-M interrupt return works by loading a magic value + into the PC. On real hardware the load causes the + return to occur. The qemu implementation performs the + jump normally, then does the exception return when the + CPU tries to execute code at the magic address. + This will cause the magic PC value to be pushed to + the stack if an interrupt occured at the wrong time. + We avoid this by disabling interrupts when + pc contains a magic address. 
*/ + if (interrupt_request & CPU_INTERRUPT_HARD + && ((IS_M(env) && env->regs[15] < 0xfffffff0) + || !(env->uncached_cpsr & CPSR_I))) { + env->exception_index = EXCP_IRQ; + do_interrupt(env); + next_tb = 0; + } +#elif defined(TARGET_SH4) + if (interrupt_request & CPU_INTERRUPT_HARD) { + do_interrupt(env); + next_tb = 0; + } +#elif defined(TARGET_ALPHA) + if (interrupt_request & CPU_INTERRUPT_HARD) { + do_interrupt(env); + next_tb = 0; + } +#elif defined(TARGET_CRIS) + if (interrupt_request & CPU_INTERRUPT_HARD + && (env->pregs[PR_CCS] & I_FLAG) + && !env->locked_irq) { + env->exception_index = EXCP_IRQ; + do_interrupt(env); + next_tb = 0; + } + if (interrupt_request & CPU_INTERRUPT_NMI + && (env->pregs[PR_CCS] & M_FLAG)) { + env->exception_index = EXCP_NMI; + do_interrupt(env); + next_tb = 0; + } +#elif defined(TARGET_M68K) + if (interrupt_request & CPU_INTERRUPT_HARD + && ((env->sr & SR_I) >> SR_I_SHIFT) + < env->pending_level) { + /* Real hardware gets the interrupt vector via an + IACK cycle at this point. Current emulated + hardware doesn't rely on this, so we + provide/save the vector when the interrupt is + first signalled. */ + env->exception_index = env->pending_vector; + do_interrupt(1); + next_tb = 0; + } +#endif + /* Don't use the cached interupt_request value, + do_interrupt may have updated the EXITTB flag. */ + if (env->interrupt_request & CPU_INTERRUPT_EXITTB) { +#ifndef VBOX + env->interrupt_request &= ~CPU_INTERRUPT_EXITTB; +#else /* VBOX */ + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~CPU_INTERRUPT_EXITTB); +#endif /* VBOX */ + /* ensure that no TB jump will be modified as + the program flow was changed */ + next_tb = 0; + } +#ifdef VBOX + RAWEx_ProfileStop(env, STATS_IRQ_HANDLING); + if (interrupt_request & CPU_INTERRUPT_RC) { + env->exception_index = EXCP_RC; + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~CPU_INTERRUPT_RC); + ret = env->exception_index; + cpu_loop_exit(); + } + if (interrupt_request & (CPU_INTERRUPT_EXTERNAL_EXIT)) { + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~(CPU_INTERRUPT_EXTERNAL_EXIT)); + env->exit_request = 1; + } +#endif + } + if (unlikely(env->exit_request)) { + env->exit_request = 0; + env->exception_index = EXCP_INTERRUPT; + cpu_loop_exit(); + } + +#ifdef VBOX + /* + * Check if we the CPU state allows us to execute the code in raw-mode. 
+ */ + RAWEx_ProfileStart(env, STATS_RAW_CHECK); + if (remR3CanExecuteRaw(env, + env->eip + env->segs[R_CS].base, + env->hflags | (env->eflags & (IOPL_MASK | TF_MASK | VM_MASK)), + &env->exception_index)) + { + RAWEx_ProfileStop(env, STATS_RAW_CHECK); + ret = env->exception_index; + cpu_loop_exit(); + } + RAWEx_ProfileStop(env, STATS_RAW_CHECK); +#endif /* VBOX */ + +#if defined(DEBUG_DISAS) || defined(CONFIG_DEBUG_EXEC) + if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) { + /* restore flags in standard format */ +#if defined(TARGET_I386) + env->eflags = env->eflags | helper_cc_compute_all(CC_OP) | (DF & DF_MASK); + log_cpu_state(env, X86_DUMP_CCOP); + env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); +#elif defined(TARGET_M68K) + cpu_m68k_flush_flags(env, env->cc_op); + env->cc_op = CC_OP_FLAGS; + env->sr = (env->sr & 0xffe0) + | env->cc_dest | (env->cc_x << 4); + log_cpu_state(env, 0); +#else + log_cpu_state(env, 0); +#endif + } +#endif /* DEBUG_DISAS || CONFIG_DEBUG_EXEC */ +#ifdef VBOX + RAWEx_ProfileStart(env, STATS_TLB_LOOKUP); +#endif /*VBOX*/ + spin_lock(&tb_lock); + tb = tb_find_fast(); + /* Note: we do it here to avoid a gcc bug on Mac OS X when + doing it in tb_find_slow */ + if (tb_invalidated_flag) { + /* as some TB could have been invalidated because + of memory exceptions while generating the code, we + must recompute the hash index here */ + next_tb = 0; + tb_invalidated_flag = 0; + } +#ifdef CONFIG_DEBUG_EXEC + qemu_log_mask(CPU_LOG_EXEC, "Trace %p [" TARGET_FMT_lx "] %s\n", + (void *)tb->tc_ptr, tb->pc, + lookup_symbol(tb->pc)); +#endif + /* see if we can patch the calling TB. When the TB + spans two pages, we cannot safely do a direct + jump. */ +#ifndef VBOX + if (next_tb != 0 && tb->page_addr[1] == -1) { +#else /* VBOX */ + if (next_tb != 0 && !(tb->cflags & CF_RAW_MODE) && tb->page_addr[1] == -1) { +#endif /* VBOX */ + tb_add_jump((TranslationBlock *)(next_tb & ~3), next_tb & 3, tb); + } + spin_unlock(&tb_lock); +#ifdef VBOX + RAWEx_ProfileStop(env, STATS_TLB_LOOKUP); +#endif + + /* cpu_interrupt might be called while translating the + TB, but before it is linked into a potentially + infinite loop and becomes env->current_tb. Avoid + starting execution if there is a pending interrupt. */ + env->current_tb = tb; + barrier(); + if (likely(!env->exit_request)) { + tc_ptr = tb->tc_ptr; + /* execute the generated code */ +#ifdef VBOX + RAWEx_ProfileStart(env, STATS_QEMU_RUN_EMULATED_CODE); +#endif +#if defined(__sparc__) && !defined(CONFIG_SOLARIS) +#undef env + env = cpu_single_env; +#define env cpu_single_env +#endif + Log5(("REM: tb=%p tc_ptr=%p %04x:%08RGv\n", tb, tc_ptr, env->segs[R_CS].selector, (RTGCPTR)env->eip)); +#if defined(VBOX) && defined(GCC_WITH_BUGGY_REGPARM) + tcg_qemu_tb_exec(tc_ptr, next_tb); +#else + next_tb = tcg_qemu_tb_exec(tc_ptr); +#endif + if (next_tb) + Log5(("REM: next_tb=%p %04x:%08RGv\n", next_tb, env->segs[R_CS].selector, (RTGCPTR)env->eip)); +#ifdef VBOX + RAWEx_ProfileStop(env, STATS_QEMU_RUN_EMULATED_CODE); +#endif + if ((next_tb & 3) == 2) { + /* Instruction counter expired. */ + int insns_left; + tb = (TranslationBlock *)(uintptr_t)(next_tb & ~3); + /* Restore PC. */ + cpu_pc_from_tb(env, tb); + insns_left = env->icount_decr.u32; + if (env->icount_extra && insns_left >= 0) { + /* Refill decrementer and continue execution. 
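/*
 * Illustrative sketch of the tagging behind "next_tb & 3" / "next_tb & ~3"
 * above: TranslationBlock pointers are assumed to be at least 4-byte aligned,
 * so the two low bits can carry the jump-slot index or a marker such as 2 for
 * "instruction budget expired".
 */
#include <assert.h>
#include <stdint.h>

static uintptr_t sk_tag(void *tb, unsigned low2)        /* low2 in 0..3 */
{
    assert(((uintptr_t)tb & 3) == 0 && low2 <= 3);
    return (uintptr_t)tb | low2;
}

static void *sk_untag(uintptr_t next_tb, unsigned *low2)
{
    *low2 = (unsigned)(next_tb & 3);
    return (void *)(next_tb & ~(uintptr_t)3);
}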
*/ + env->icount_extra += insns_left; + if (env->icount_extra > 0xffff) { + insns_left = 0xffff; + } else { + insns_left = env->icount_extra; + } + env->icount_extra -= insns_left; + env->icount_decr.u16.low = insns_left; + } else { + if (insns_left > 0) { + /* Execute remaining instructions. */ + cpu_exec_nocache(insns_left, tb); + } + env->exception_index = EXCP_INTERRUPT; + next_tb = 0; + cpu_loop_exit(); + } + } + } + env->current_tb = NULL; + /* reset soft MMU for next block (it can currently + only be set by a memory fault) */ + } /* for(;;) */ + } +#ifdef VBOX_HIGH_RES_TIMERS_HACK + /* NULL the current_tb here so cpu_interrupt() doesn't do anything + unnecessary (like crashing during emulate single instruction). + Note! Don't use env1->pVM here, the code wouldn't run with + gcc-4.4/amd64 anymore, see #3883. */ + env->current_tb = NULL; + if ( !(env->interrupt_request & ( CPU_INTERRUPT_DEBUG | CPU_INTERRUPT_EXTERNAL_EXIT | CPU_INTERRUPT_RC + | CPU_INTERRUPT_SINGLE_INSTR | CPU_INTERRUPT_SINGLE_INSTR_IN_FLIGHT)) + && ( (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_TIMER) + || TMTimerPollBool(env->pVM, env->pVCpu)) ) { + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~CPU_INTERRUPT_EXTERNAL_TIMER); + remR3ProfileStart(STATS_QEMU_RUN_TIMERS); + TMR3TimerQueuesDo(env->pVM); + remR3ProfileStop(STATS_QEMU_RUN_TIMERS); + } +#endif + } /* for(;;) */ + + +#if defined(TARGET_I386) + /* restore flags in standard format */ + env->eflags = env->eflags | helper_cc_compute_all(CC_OP) | (DF & DF_MASK); +#elif defined(TARGET_ARM) + /* XXX: Save/restore host fpu exception state?. */ +#elif defined(TARGET_SPARC) +#elif defined(TARGET_PPC) +#elif defined(TARGET_M68K) + cpu_m68k_flush_flags(env, env->cc_op); + env->cc_op = CC_OP_FLAGS; + env->sr = (env->sr & 0xffe0) + | env->cc_dest | (env->cc_x << 4); +#elif defined(TARGET_MICROBLAZE) +#elif defined(TARGET_MIPS) +#elif defined(TARGET_SH4) +#elif defined(TARGET_ALPHA) +#elif defined(TARGET_CRIS) +#elif defined(TARGET_S390X) + /* XXXXX */ +#else +#error unsupported target CPU +#endif + + /* restore global registers */ + barrier(); + env = (void *) saved_env_reg; + +# ifndef VBOX /* we might be using elsewhere, we only have one. 
*/ + /* fail safe : never use cpu_single_env outside cpu_exec() */ + cpu_single_env = NULL; +# endif + return ret; +} + +/* must only be called from the generated code as an exception can be + generated */ +void tb_invalidate_page_range(target_ulong start, target_ulong end) +{ + /* XXX: cannot enable it yet because it yields to MMU exception + where NIP != read address on PowerPC */ +#if 0 + target_ulong phys_addr; + phys_addr = get_phys_addr_code(env, start); + tb_invalidate_phys_page_range(phys_addr, phys_addr + end - start, 0); +#endif +} + +#if defined(TARGET_I386) && defined(CONFIG_USER_ONLY) + +void cpu_x86_load_seg(CPUX86State *s, int seg_reg, int selector) +{ + CPUX86State *saved_env; + + saved_env = env; + env = s; + if (!(env->cr[0] & CR0_PE_MASK) || (env->eflags & VM_MASK)) { + selector &= 0xffff; + cpu_x86_load_seg_cache(env, seg_reg, selector, + (selector << 4), 0xffff, 0); + } else { + helper_load_seg(seg_reg, selector); + } + env = saved_env; +} + +void cpu_x86_fsave(CPUX86State *s, target_ulong ptr, int data32) +{ + CPUX86State *saved_env; + + saved_env = env; + env = s; + + helper_fsave(ptr, data32); + + env = saved_env; +} + +void cpu_x86_frstor(CPUX86State *s, target_ulong ptr, int data32) +{ + CPUX86State *saved_env; + + saved_env = env; + env = s; + + helper_frstor(ptr, data32); + + env = saved_env; +} + +#endif /* TARGET_I386 */ + +#if !defined(CONFIG_SOFTMMU) + +#if defined(TARGET_I386) +#define EXCEPTION_ACTION raise_exception_err(env->exception_index, env->error_code) +#else +#define EXCEPTION_ACTION cpu_loop_exit() +#endif + +/* 'pc' is the host PC at which the exception was raised. 'address' is + the effective address of the memory exception. 'is_write' is 1 if a + write caused the exception and otherwise 0'. 'old_set' is the + signal set which should be restored */ +static inline int handle_cpu_signal(uintptr_t pc, uintptr_t address, + int is_write, sigset_t *old_set, + void *puc) +{ + TranslationBlock *tb; + int ret; + + if (cpu_single_env) + env = cpu_single_env; /* XXX: find a correct solution for multithread */ +#if defined(DEBUG_SIGNAL) + qemu_printf("qemu: SIGSEGV pc=0x%08lx address=%08lx w=%d oldset=0x%08lx\n", + pc, address, is_write, *(unsigned long *)old_set); +#endif + /* XXX: locking issue */ + if (is_write && page_unprotect(h2g(address), pc, puc)) { + return 1; + } + + /* see if it is an MMU fault */ + ret = cpu_handle_mmu_fault(env, address, is_write, MMU_USER_IDX, 0); + if (ret < 0) + return 0; /* not an MMU fault */ + if (ret == 0) + return 1; /* the MMU fault was handled without causing real CPU fault */ + /* now we have a real cpu fault */ + tb = tb_find_pc(pc); + if (tb) { + /* the PC is inside the translated code. 
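/*
 * Illustrative sketch of the real-mode segment arithmetic used by
 * cpu_x86_load_seg() above: outside protected mode the base is simply
 * selector << 4 with a 64 KiB limit, so a seg:off pair maps to a linear
 * address as below.
 */
#include <stdint.h>

static uint32_t sk_real_mode_linear(uint16_t selector, uint16_t offset)
{
    uint32_t base = (uint32_t)selector << 4;    /* e.g. 0x1234 -> 0x12340 */
    return base + offset;                       /* 0x1234:0x0010 -> 0x12350 */
}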
It means that we have + a virtual CPU fault */ + cpu_restore_state(tb, env, pc, puc); + } + + /* we restore the process signal mask as the sigreturn should + do it (XXX: use sigsetjmp) */ + sigprocmask(SIG_SETMASK, old_set, NULL); + EXCEPTION_ACTION; + + /* never comes here */ + return 1; +} + +#if defined(__i386__) + +#if defined(__APPLE__) +# include <sys/ucontext.h> + +# define EIP_sig(context) (*((unsigned long*)&(context)->uc_mcontext->ss.eip)) +# define TRAP_sig(context) ((context)->uc_mcontext->es.trapno) +# define ERROR_sig(context) ((context)->uc_mcontext->es.err) +# define MASK_sig(context) ((context)->uc_sigmask) +#elif defined (__NetBSD__) +# include <ucontext.h> + +# define EIP_sig(context) ((context)->uc_mcontext.__gregs[_REG_EIP]) +# define TRAP_sig(context) ((context)->uc_mcontext.__gregs[_REG_TRAPNO]) +# define ERROR_sig(context) ((context)->uc_mcontext.__gregs[_REG_ERR]) +# define MASK_sig(context) ((context)->uc_sigmask) +#elif defined (__FreeBSD__) || defined(__DragonFly__) +# include <ucontext.h> + +# define EIP_sig(context) (*((unsigned long*)&(context)->uc_mcontext.mc_eip)) +# define TRAP_sig(context) ((context)->uc_mcontext.mc_trapno) +# define ERROR_sig(context) ((context)->uc_mcontext.mc_err) +# define MASK_sig(context) ((context)->uc_sigmask) +#elif defined(__OpenBSD__) +# define EIP_sig(context) ((context)->sc_eip) +# define TRAP_sig(context) ((context)->sc_trapno) +# define ERROR_sig(context) ((context)->sc_err) +# define MASK_sig(context) ((context)->sc_mask) +#else +# define EIP_sig(context) ((context)->uc_mcontext.gregs[REG_EIP]) +# define TRAP_sig(context) ((context)->uc_mcontext.gregs[REG_TRAPNO]) +# define ERROR_sig(context) ((context)->uc_mcontext.gregs[REG_ERR]) +# define MASK_sig(context) ((context)->uc_sigmask) +#endif + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; +#if defined(__NetBSD__) || defined (__FreeBSD__) || defined(__DragonFly__) + ucontext_t *uc = puc; +#elif defined(__OpenBSD__) + struct sigcontext *uc = puc; +#else + struct ucontext *uc = puc; +#endif + uintptr_t pc; + int trapno; + +#ifndef REG_EIP +/* for glibc 2.1 */ +#define REG_EIP EIP +#define REG_ERR ERR +#define REG_TRAPNO TRAPNO +#endif + pc = EIP_sig(uc); + trapno = TRAP_sig(uc); + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + trapno == 0xe ? 
+ (ERROR_sig(uc) >> 1) & 1 : 0, + &MASK_sig(uc), puc); +} + +#elif defined(__x86_64__) + +#ifdef __NetBSD__ +#define PC_sig(context) _UC_MACHINE_PC(context) +#define TRAP_sig(context) ((context)->uc_mcontext.__gregs[_REG_TRAPNO]) +#define ERROR_sig(context) ((context)->uc_mcontext.__gregs[_REG_ERR]) +#define MASK_sig(context) ((context)->uc_sigmask) +#elif defined(__OpenBSD__) +#define PC_sig(context) ((context)->sc_rip) +#define TRAP_sig(context) ((context)->sc_trapno) +#define ERROR_sig(context) ((context)->sc_err) +#define MASK_sig(context) ((context)->sc_mask) +#elif defined (__FreeBSD__) || defined(__DragonFly__) +#include <ucontext.h> + +#define PC_sig(context) (*((unsigned long*)&(context)->uc_mcontext.mc_rip)) +#define TRAP_sig(context) ((context)->uc_mcontext.mc_trapno) +#define ERROR_sig(context) ((context)->uc_mcontext.mc_err) +#define MASK_sig(context) ((context)->uc_sigmask) +#else +#define PC_sig(context) ((context)->uc_mcontext.gregs[REG_RIP]) +#define TRAP_sig(context) ((context)->uc_mcontext.gregs[REG_TRAPNO]) +#define ERROR_sig(context) ((context)->uc_mcontext.gregs[REG_ERR]) +#define MASK_sig(context) ((context)->uc_sigmask) +#endif + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; + uintptr_t pc; +#if defined(__NetBSD__) || defined (__FreeBSD__) || defined(__DragonFly__) + ucontext_t *uc = puc; +#elif defined(__OpenBSD__) + struct sigcontext *uc = puc; +#else + struct ucontext *uc = puc; +#endif + + pc = PC_sig(uc); + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + TRAP_sig(uc) == 0xe ? + (ERROR_sig(uc) >> 1) & 1 : 0, + &MASK_sig(uc), puc); +} + +#elif defined(_ARCH_PPC) + +/*********************************************************************** + * signal context platform-specific definitions + * From Wine + */ +#ifdef linux +/* All Registers access - only for local access */ +# define REG_sig(reg_name, context) ((context)->uc_mcontext.regs->reg_name) +/* Gpr Registers access */ +# define GPR_sig(reg_num, context) REG_sig(gpr[reg_num], context) +# define IAR_sig(context) REG_sig(nip, context) /* Program counter */ +# define MSR_sig(context) REG_sig(msr, context) /* Machine State Register (Supervisor) */ +# define CTR_sig(context) REG_sig(ctr, context) /* Count register */ +# define XER_sig(context) REG_sig(xer, context) /* User's integer exception register */ +# define LR_sig(context) REG_sig(link, context) /* Link register */ +# define CR_sig(context) REG_sig(ccr, context) /* Condition register */ +/* Float Registers access */ +# define FLOAT_sig(reg_num, context) (((double*)((char*)((context)->uc_mcontext.regs+48*4)))[reg_num]) +# define FPSCR_sig(context) (*(int*)((char*)((context)->uc_mcontext.regs+(48+32*2)*4))) +/* Exception Registers access */ +# define DAR_sig(context) REG_sig(dar, context) +# define DSISR_sig(context) REG_sig(dsisr, context) +# define TRAP_sig(context) REG_sig(trap, context) +#endif /* linux */ + +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +#include <ucontext.h> +# define IAR_sig(context) ((context)->uc_mcontext.mc_srr0) +# define MSR_sig(context) ((context)->uc_mcontext.mc_srr1) +# define CTR_sig(context) ((context)->uc_mcontext.mc_ctr) +# define XER_sig(context) ((context)->uc_mcontext.mc_xer) +# define LR_sig(context) ((context)->uc_mcontext.mc_lr) +# define CR_sig(context) ((context)->uc_mcontext.mc_cr) +/* Exception Registers access */ +# define DAR_sig(context) ((context)->uc_mcontext.mc_dar) +# define DSISR_sig(context) ((context)->uc_mcontext.mc_dsisr) +# define 
TRAP_sig(context) ((context)->uc_mcontext.mc_exc) +#endif /* __FreeBSD__|| __FreeBSD_kernel__ */ + +#ifdef __APPLE__ +# include <sys/ucontext.h> +typedef struct ucontext SIGCONTEXT; +/* All Registers access - only for local access */ +# define REG_sig(reg_name, context) ((context)->uc_mcontext->ss.reg_name) +# define FLOATREG_sig(reg_name, context) ((context)->uc_mcontext->fs.reg_name) +# define EXCEPREG_sig(reg_name, context) ((context)->uc_mcontext->es.reg_name) +# define VECREG_sig(reg_name, context) ((context)->uc_mcontext->vs.reg_name) +/* Gpr Registers access */ +# define GPR_sig(reg_num, context) REG_sig(r##reg_num, context) +# define IAR_sig(context) REG_sig(srr0, context) /* Program counter */ +# define MSR_sig(context) REG_sig(srr1, context) /* Machine State Register (Supervisor) */ +# define CTR_sig(context) REG_sig(ctr, context) +# define XER_sig(context) REG_sig(xer, context) /* Link register */ +# define LR_sig(context) REG_sig(lr, context) /* User's integer exception register */ +# define CR_sig(context) REG_sig(cr, context) /* Condition register */ +/* Float Registers access */ +# define FLOAT_sig(reg_num, context) FLOATREG_sig(fpregs[reg_num], context) +# define FPSCR_sig(context) ((double)FLOATREG_sig(fpscr, context)) +/* Exception Registers access */ +# define DAR_sig(context) EXCEPREG_sig(dar, context) /* Fault registers for coredump */ +# define DSISR_sig(context) EXCEPREG_sig(dsisr, context) +# define TRAP_sig(context) EXCEPREG_sig(exception, context) /* number of powerpc exception taken */ +#endif /* __APPLE__ */ + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) + ucontext_t *uc = puc; +#else + struct ucontext *uc = puc; +#endif + uintptr_t pc; + int is_write; + + pc = IAR_sig(uc); + is_write = 0; +#if 0 + /* ppc 4xx case */ + if (DSISR_sig(uc) & 0x00800000) + is_write = 1; +#else + if (TRAP_sig(uc) != 0x400 && (DSISR_sig(uc) & 0x02000000)) + is_write = 1; +#endif + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, &uc->uc_sigmask, puc); +} + +#elif defined(__alpha__) + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; + struct ucontext *uc = puc; + uint32_t *pc = uc->uc_mcontext.sc_pc; + uint32_t insn = *pc; + int is_write = 0; + + /* XXX: need kernel patch to get write flag faster */ + switch (insn >> 26) { + case 0x0d: // stw + case 0x0e: // stb + case 0x0f: // stq_u + case 0x24: // stf + case 0x25: // stg + case 0x26: // sts + case 0x27: // stt + case 0x2c: // stl + case 0x2d: // stq + case 0x2e: // stl_c + case 0x2f: // stq_c + is_write = 1; + } + + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, &uc->uc_sigmask, puc); +} +#elif defined(__sparc__) + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; + int is_write; + uint32_t insn; +#if !defined(__arch64__) || defined(CONFIG_SOLARIS) + uint32_t *regs = (uint32_t *)(info + 1); + void *sigmask = (regs + 20); + /* XXX: is there a standard glibc define ? 
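/*
 * Illustrative sketch of the store classification used by the Alpha and SPARC
 * handlers above: fixed-width 32-bit instruction sets keep the major opcode in
 * the top bits, so the faulting instruction word can be classified with one
 * shift and a switch.  The opcode values are a subset of the Alpha case list
 * above.
 */
#include <stdint.h>

static int sk_alpha_insn_is_store(uint32_t insn)
{
    switch (insn >> 26) {       /* bits 31..26: Alpha major opcode */
    case 0x0d:                  /* stw */
    case 0x0e:                  /* stb */
    case 0x2c:                  /* stl */
    case 0x2d:                  /* stq */
        return 1;
    default:
        return 0;
    }
}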
*/ + uintptr_t pc = regs[1]; +#else +#ifdef __linux__ + struct sigcontext *sc = puc; + uintptr_t pc = sc->sigc_regs.tpc; + void *sigmask = (void *)sc->sigc_mask; +#elif defined(__OpenBSD__) + struct sigcontext *uc = puc; + uintptr_t pc = uc->sc_pc; + void *sigmask = (void *)(uintptr_t)uc->sc_mask; +#endif +#endif + + /* XXX: need kernel patch to get write flag faster */ + is_write = 0; + insn = *(uint32_t *)pc; + if ((insn >> 30) == 3) { + switch((insn >> 19) & 0x3f) { + case 0x05: // stb + case 0x15: // stba + case 0x06: // sth + case 0x16: // stha + case 0x04: // st + case 0x14: // sta + case 0x07: // std + case 0x17: // stda + case 0x0e: // stx + case 0x1e: // stxa + case 0x24: // stf + case 0x34: // stfa + case 0x27: // stdf + case 0x37: // stdfa + case 0x26: // stqf + case 0x36: // stqfa + case 0x25: // stfsr + case 0x3c: // casa + case 0x3e: // casxa + is_write = 1; + break; + } + } + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, sigmask, NULL); +} + +#elif defined(__arm__) + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; + struct ucontext *uc = puc; + uintptr_t pc; + int is_write; + +#if (__GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 3)) + pc = uc->uc_mcontext.gregs[R15]; +#else + pc = uc->uc_mcontext.arm_pc; +#endif + /* XXX: compute is_write */ + is_write = 0; + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, + &uc->uc_sigmask, puc); +} + +#elif defined(__mc68000) + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; + struct ucontext *uc = puc; + uintptr_t pc; + int is_write; + + pc = uc->uc_mcontext.gregs[16]; + /* XXX: compute is_write */ + is_write = 0; + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, + &uc->uc_sigmask, puc); +} + +#elif defined(__ia64) + +#ifndef __ISR_VALID + /* This ought to be in <bits/siginfo.h>... */ +# define __ISR_VALID 1 +#endif + +int cpu_signal_handler(int host_signum, void *pinfo, void *puc) +{ + siginfo_t *info = pinfo; + struct ucontext *uc = puc; + uintptr_t ip; + int is_write = 0; + + ip = uc->uc_mcontext.sc_ip; + switch (host_signum) { + case SIGILL: + case SIGFPE: + case SIGSEGV: + case SIGBUS: + case SIGTRAP: + if (info->si_code && (info->si_segvflags & __ISR_VALID)) + /* ISR.W (write-access) is bit 33: */ + is_write = (info->si_isr >> 33) & 1; + break; + + default: + break; + } + return handle_cpu_signal(ip, (uintptr_t)info->si_addr, + is_write, + (sigset_t *)&uc->uc_sigmask, puc); +} + +#elif defined(__s390__) + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; + struct ucontext *uc = puc; + uintptr_t pc; + uint16_t *pinsn; + int is_write = 0; + + pc = uc->uc_mcontext.psw.addr; + + /* ??? On linux, the non-rt signal handler has 4 (!) arguments instead + of the normal 2 arguments. The 3rd argument contains the "int_code" + from the hardware which does in fact contain the is_write value. + The rt signal handler, as far as I can tell, does not give this value + at all. Not that we could get to it from here even if it were. */ + /* ??? This is not even close to complete, since it ignores all + of the read-modify-write instructions. 
*/ + pinsn = (uint16_t *)pc; + switch (pinsn[0] >> 8) { + case 0x50: /* ST */ + case 0x42: /* STC */ + case 0x40: /* STH */ + is_write = 1; + break; + case 0xc4: /* RIL format insns */ + switch (pinsn[0] & 0xf) { + case 0xf: /* STRL */ + case 0xb: /* STGRL */ + case 0x7: /* STHRL */ + is_write = 1; + } + break; + case 0xe3: /* RXY format insns */ + switch (pinsn[2] & 0xff) { + case 0x50: /* STY */ + case 0x24: /* STG */ + case 0x72: /* STCY */ + case 0x70: /* STHY */ + case 0x8e: /* STPQ */ + case 0x3f: /* STRVH */ + case 0x3e: /* STRV */ + case 0x2f: /* STRVG */ + is_write = 1; + } + break; + } + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, &uc->uc_sigmask, puc); +} + +#elif defined(__mips__) + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + siginfo_t *info = pinfo; + struct ucontext *uc = puc; + greg_t pc = uc->uc_mcontext.pc; + int is_write; + + /* XXX: compute is_write */ + is_write = 0; + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, &uc->uc_sigmask, puc); +} + +#elif defined(__hppa__) + +int cpu_signal_handler(int host_signum, void *pinfo, + void *puc) +{ + struct siginfo *info = pinfo; + struct ucontext *uc = puc; + uintptr_t pc = uc->uc_mcontext.sc_iaoq[0]; + uint32_t insn = *(uint32_t *)pc; + int is_write = 0; + + /* XXX: need kernel patch to get write flag faster. */ + switch (insn >> 26) { + case 0x1a: /* STW */ + case 0x19: /* STH */ + case 0x18: /* STB */ + case 0x1b: /* STWM */ + is_write = 1; + break; + + case 0x09: /* CSTWX, FSTWX, FSTWS */ + case 0x0b: /* CSTDX, FSTDX, FSTDS */ + /* Distinguish from coprocessor load ... */ + is_write = (insn >> 9) & 1; + break; + + case 0x03: + switch ((insn >> 6) & 15) { + case 0xa: /* STWS */ + case 0x9: /* STHS */ + case 0x8: /* STBS */ + case 0xe: /* STWAS */ + case 0xc: /* STBYS */ + is_write = 1; + } + break; + } + + return handle_cpu_signal(pc, (uintptr_t)info->si_addr, + is_write, &uc->uc_sigmask, puc); +} + +#else + +#error host CPU specific signal handler needed + +#endif + +#endif /* !defined(CONFIG_SOFTMMU) */ diff --git a/src/recompiler/cutils.c b/src/recompiler/cutils.c new file mode 100644 index 00000000..5fbf70bd --- /dev/null +++ b/src/recompiler/cutils.c @@ -0,0 +1,752 @@ +/* + * Simple C functions to supplement the C library + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "host-utils.h" + +#ifdef VBOX +# include "osdep.h" + + +static inline int toupper(int ch) { + if ( (unsigned int)(ch - 'a') < 26u ) + ch += 'A' - 'a'; + return ch; +} + +/* Quick sort from OpenSolaris: + http://src.opensolaris.org/source/raw/onnv/onnv-gate/usr/src/common/util/qsort.c */ +/* + * choose a median of 3 values + * + * note: cstyle specifically prohibits nested conditional operators + * but this is the only way to do the median of 3 function in-line + */ +#define med3(a, b, c) (cmp((a), (b)) < 0) \ + ? ((cmp((b), (c)) < 0) ? (b) : (cmp((a), (c)) < 0) ? (c) : (a)) \ + : ((cmp((b), (c)) > 0) ? (b) : (cmp((a), (c)) > 0) ? (c) : (a)) + +#define THRESH_L 5 /* threshold for insertion sort */ +#define THRESH_M3 20 /* threshold for median of 3 */ +#define THRESH_M9 50 /* threshold for median of 9 */ + +typedef struct { + char *b_lim; + size_t nrec; +} stk_t; + +/* + * The following swap functions should not create a stack frame + * the SPARC call / return instruction will be executed + * but the a save / restore will not be executed + * which means we won't do a window turn with the spill / fill overhead + * verify this by examining the assembly code + */ + +/* ARGSUSED */ +static void +swapp32(uint32_t *r1, uint32_t *r2, size_t cnt) +{ + uint32_t temp; + + temp = *r1; + *r1++ = *r2; + *r2++ = temp; +} + +/* ARGSUSED */ +static void +swapp64(uint64_t *r1, uint64_t *r2, size_t cnt) +{ + uint64_t temp; + + temp = *r1; + *r1++ = *r2; + *r2++ = temp; +} + +static void +swapi(uint32_t *r1, uint32_t *r2, size_t cnt) +{ + uint32_t temp; + + /* character by character */ + while (cnt--) { + temp = *r1; + *r1++ = *r2; + *r2++ = temp; + } +} + +static void +swapb(char *r1, char *r2, size_t cnt) +{ + char temp; + + /* character by character */ + while (cnt--) { + temp = *r1; + *r1++ = *r2; + *r2++ = temp; + } +} + +/* + * qsort() is a general purpose, in-place sorting routine using a + * user provided call back function for comparisons. This implementation + * utilizes a ternary quicksort algorithm, and cuts over to an + * insertion sort for partitions involving fewer than THRESH_L records. + * + * Potential User Errors + * There is no return value from qsort, this function has no method + * of alerting the user that a sort did not work or could not work. + * We do not print an error message or exit the process or thread, + * Even if we can detect an error, We CANNOT silently return without + * sorting the data, if we did so the user could never be sure the + * sort completed successfully. + * It is possible we could change the return value of sort from void + * to int and return success or some error codes, but this gets into + * standards and compatibility issues. + * + * Examples of qsort parameter errors might be + * 1) record size (rsiz) equal to 0 + * qsort will loop and never return. + * 2) record size (rsiz) less than 0 + * rsiz is unsigned, so a negative value is insanely large + * 3) number of records (nrec) is 0 + * This is legal - qsort will return without examining any records + * 4) number of records (nrec) is less than 0 + * nrec is unsigned, so a negative value is insanely large. 
+ * 5) nrec * rsiz > memory allocation for sort array + * a segment violation may occur + * corruption of other memory may occur + * 6) The base address of the sort array is invalid + * a segment violation may occur + * corruption of other memory may occur + * 7) The user call back function is invalid + * we may get alignment errors or segment violations + * we may jump into never-never land + * + * Some less obvious errors might be + * 8) The user compare function is not comparing correctly + * 9) The user compare function modifies the data records + */ + +void +qemu_qsort( + void *basep, + size_t nrec, + size_t rsiz, + int (*cmp)(const void *, const void *)) +{ + size_t i; /* temporary variable */ + + /* variables used by swap */ + void (*swapf)(char *, char *, size_t); + size_t loops; + + /* variables used by sort */ + stk_t stack[8 * sizeof (nrec) + 1]; + stk_t *sp; + char *b_lim; /* bottom limit */ + char *b_dup; /* bottom duplicate */ + char *b_par; /* bottom partition */ + char *t_lim; /* top limit */ + char *t_dup; /* top duplicate */ + char *t_par; /* top partition */ + char *m1, *m2, *m3; /* median pointers */ + uintptr_t d_bytelength; /* byte length of duplicate records */ + int b_nrec; + int t_nrec; + int cv; /* results of compare (bottom / top) */ + + /* + * choose a swap function based on alignment and size + * + * The qsort function sorts an array of fixed length records. + * We have very limited knowledge about the data record itself. + * It may be that the data record is in the array we are sorting + * or it may be that the array contains pointers or indexes to + * the actual data record and all that we are sorting is the indexes. + * + * The following decision will choose an optimal swap function + * based on the size and alignment of the data records + * swapp64 will swap 64 bit pointers + * swapp32 will swap 32 bit pointers + * swapi will swap an array of 32 bit integers + * swapb will swap an array of 8 bit characters + * + * swapi and swapb will also require the variable loops to be set + * to control the length of the array being swapped + */ + if ((((uintptr_t)basep & (sizeof (uint64_t) - 1)) == 0) && + (rsiz == sizeof (uint64_t))) { + loops = 1; + swapf = (void (*)(char *, char *, size_t))swapp64; + } else if ((((uintptr_t)basep & (sizeof (uint32_t) - 1)) == 0) && + (rsiz == sizeof (uint32_t))) { + loops = 1; + swapf = (void (*)(char *, char *, size_t))swapp32; + } else if ((((uintptr_t)basep & (sizeof (uint32_t) - 1)) == 0) && + ((rsiz & (sizeof (uint32_t) - 1)) == 0)) { + loops = rsiz / sizeof (int); + swapf = (void (*)(char *, char *, size_t))swapi; + } else { + loops = rsiz; + swapf = swapb; + } + + /* + * qsort is a partitioning sort + * + * the stack is the bookkeeping mechanism to keep track of all + * the partitions. + * + * each sort pass takes one partition and sorts it into two partitions. + * at the top of the loop we simply take the partition on the top + * of the stack and sort it. See the comments at the bottom + * of the loop regarding which partitions to add in what order. + * + * initially put the whole partition on the stack + */ + sp = stack; + sp->b_lim = (char *)basep; + sp->nrec = nrec; + sp++; + while (sp > stack) { + sp--; + b_lim = sp->b_lim; + nrec = sp->nrec; + + /* + * a linear insertion sort i faster than a qsort for + * very small number of records (THRESH_L) + * + * if number records < threshold use linear insertion sort + * + * this also handles the special case where the partition + * 0 or 1 records length. 
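+         * (THRESH_L is 5 above, so any partition of fewer than five
+         * records is finished off by this insertion sort.)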
+ */ + if (nrec < THRESH_L) { + /* + * Linear insertion sort + */ + t_par = b_lim; + for (i = 1; i < nrec; i++) { + t_par += rsiz; + b_par = t_par; + while (b_par > b_lim) { + b_par -= rsiz; + if ((*cmp)(b_par, b_par + rsiz) <= 0) { + break; + } + (*swapf)(b_par, b_par + rsiz, loops); + } + } + + /* + * a linear insertion sort will put all records + * in their final position and will not create + * subpartitions. + * + * therefore when the insertion sort is complete + * just go to the top of the loop and get the + * next partition to sort. + */ + continue; + } + + /* quicksort */ + + /* + * choose a pivot record + * + * Ideally the pivot record will divide the partition + * into two equal parts. however we have to balance the + * work involved in selecting the pivot record with the + * expected benefit. + * + * The choice of pivot record depends on the number of + * records in the partition + * + * for small partitions (nrec < THRESH_M3) + * we just select the record in the middle of the partition + * + * if (nrec >= THRESH_M3 && nrec < THRESH_M9) + * we select three values and choose the median of 3 + * + * if (nrec >= THRESH_M9) + * then we use an approximate median of 9 + * 9 records are selected and grouped in 3 groups of 3 + * the median of each of these 3 groups is fed into another + * median of 3 decision. + * + * Each median of 3 decision is 2 or 3 compares, + * so median of 9 costs between 8 and 12 compares. + * + * i is byte distance between two consecutive samples + * m2 will point to the pivot record + */ + if (nrec < THRESH_M3) { + m2 = b_lim + (nrec / 2) * rsiz; + } else if (nrec < THRESH_M9) { + /* use median of 3 */ + i = ((nrec - 1) / 2) * rsiz; + m2 = med3(b_lim, b_lim + i, b_lim + 2 * i); + } else { + /* approx median of 9 */ + i = ((nrec - 1) / 8) * rsiz; + m1 = med3(b_lim, b_lim + i, b_lim + 2 * i); + m2 = med3(b_lim + 3 * i, b_lim + 4 * i, b_lim + 5 * i); + m3 = med3(b_lim + 6 * i, b_lim + 7 * i, b_lim + 8 * i); + m2 = med3(m1, m2, m3); + } + + /* + * quick sort partitioning + * + * The partition limits are defined by bottom and top pointers + * b_lim and t_lim. + * + * qsort uses a fairly standard method of moving the + * partitioning pointers, b_par and t_par, to the middle of + * the partition and exchanging records that are in the + * wrong part of the partition. + * + * Two enhancements have been made to the basic algorithm. + * One for handling duplicate records and one to minimize + * the number of swaps. + * + * Two duplicate records pointers are (b_dup and t_dup) are + * initially set to b_lim and t_lim. Each time a record + * whose sort key value is equal to the pivot record is found + * it will be swapped with the record pointed to by + * b_dup or t_dup and the duplicate pointer will be + * incremented toward the center. + * When partitioning is complete, all the duplicate records + * will have been collected at the upper and lower limits of + * the partition and can easily be moved adjacent to the + * pivot record. + * + * The second optimization is to minimize the number of swaps. + * The pointer m2 points to the pivot record. + * During partitioning, if m2 is ever equal to the partitioning + * pointers, b_par or t_par, then b_par or t_par just moves + * onto the next record without doing a compare. + * If as a result of duplicate record detection, + * b_dup or t_dup is ever equal to m2, + * then m2 is changed to point to the duplicate record and + * b_dup or t_dup is incremented with out swapping records. 
+ * + * When partitioning is done, we may not have the same pivot + * record that we started with, but we will have one with + * an equal sort key. + */ + b_dup = b_par = b_lim; + t_dup = t_par = t_lim = b_lim + rsiz * (nrec - 1); + for (;;) { + + /* move bottom pointer up */ + for (; b_par <= t_par; b_par += rsiz) { + if (b_par == m2) { + continue; + } + cv = cmp(b_par, m2); + if (cv > 0) { + break; + } + if (cv == 0) { + if (b_dup == m2) { + m2 = b_par; + } else if (b_dup != b_par) { + (*swapf)(b_dup, b_par, loops); + } + b_dup += rsiz; + } + } + + /* move top pointer down */ + for (; b_par < t_par; t_par -= rsiz) { + if (t_par == m2) { + continue; + } + cv = cmp(t_par, m2); + if (cv < 0) { + break; + } + if (cv == 0) { + if (t_dup == m2) { + m2 = t_par; + } else if (t_dup != t_par) { + (*swapf)(t_dup, t_par, loops); + } + t_dup -= rsiz; + } + } + + /* break if we are done partitioning */ + if (b_par >= t_par) { + break; + } + + /* exchange records at upper and lower break points */ + (*swapf)(b_par, t_par, loops); + b_par += rsiz; + t_par -= rsiz; + } + + /* + * partitioning is now complete + * + * there are two termination conditions from the partitioning + * loop above. Either b_par or t_par have crossed or + * they are equal. + * + * we need to swap the pivot record to its final position + * m2 could be in either the upper or lower partitions + * or it could already be in its final position + */ + /* + * R[b_par] > R[m2] + * R[t_par] < R[m2] + */ + if (t_par < b_par) { + if (m2 < t_par) { + (*swapf)(m2, t_par, loops); + m2 = b_par = t_par; + } else if (m2 > b_par) { + (*swapf)(m2, b_par, loops); + m2 = t_par = b_par; + } else { + b_par = t_par = m2; + } + } else { + if (m2 < t_par) { + t_par = b_par = t_par - rsiz; + } + if (m2 != b_par) { + (*swapf)(m2, b_par, loops); + } + m2 = t_par; + } + + /* + * move bottom duplicates next to pivot + * optimized to eliminate overlap + */ + d_bytelength = b_dup - b_lim; + if (b_par - b_dup < d_bytelength) { + b_dup = b_lim + (b_par - b_dup); + } + while (b_dup > b_lim) { + b_dup -= rsiz; + b_par -= rsiz; + (*swapf)(b_dup, b_par, loops); + } + b_par = m2 - d_bytelength; + + /* + * move top duplicates next to pivot + */ + d_bytelength = t_lim - t_dup; + if (t_dup - t_par < d_bytelength) { + t_dup = t_lim - (t_dup - t_par); + } + while (t_dup < t_lim) { + t_dup += rsiz; + t_par += rsiz; + (*swapf)(t_dup, t_par, loops); + } + t_par = m2 + d_bytelength; + + /* + * when a qsort pass completes there are three partitions + * 1) the lower contains all records less than pivot + * 2) the upper contains all records greater than pivot + * 3) the pivot partition contains all record equal to pivot + * + * all records in the pivot partition are in their final + * position and do not need to be accounted for by the stack + * + * when adding partitions to the stack + * it is important to add the largest partition first + * to prevent stack overflow. 
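+         * (pushing the larger partition first and the smaller one on top
+         * means the smaller piece is always sorted next, so at most about
+         * log2(nrec) partitions are ever pending at once, well within the
+         * stack[8 * sizeof (nrec) + 1] bound declared above)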
+ * + * calculate number of unsorted records in top and bottom + * push resulting partitions on stack + */ + b_nrec = (b_par - b_lim) / rsiz; + t_nrec = (t_lim - t_par) / rsiz; + if (b_nrec < t_nrec) { + sp->b_lim = t_par + rsiz; + sp->nrec = t_nrec; + sp++; + sp->b_lim = b_lim; + sp->nrec = b_nrec; + sp++; + } else { + sp->b_lim = b_lim; + sp->nrec = b_nrec; + sp++; + sp->b_lim = t_par + rsiz; + sp->nrec = t_nrec; + sp++; + } + } +} + +#endif /* VBOX */ + +void pstrcpy(char *buf, int buf_size, const char *str) +{ + int c; + char *q = buf; + + if (buf_size <= 0) + return; + + for(;;) { + c = *str++; + if (c == 0 || q >= buf + buf_size - 1) + break; + *q++ = c; + } + *q = '\0'; +} + +/* strcat and truncate. */ +char *pstrcat(char *buf, int buf_size, const char *s) +{ + int len; + len = strlen(buf); + if (len < buf_size) + pstrcpy(buf + len, buf_size - len, s); + return buf; +} + +int strstart(const char *str, const char *val, const char **ptr) +{ + const char *p, *q; + p = str; + q = val; + while (*q != '\0') { + if (*p != *q) + return 0; + p++; + q++; + } + if (ptr) + *ptr = p; + return 1; +} + +int stristart(const char *str, const char *val, const char **ptr) +{ + const char *p, *q; + p = str; + q = val; + while (*q != '\0') { + if (qemu_toupper(*p) != qemu_toupper(*q)) + return 0; + p++; + q++; + } + if (ptr) + *ptr = p; + return 1; +} + +/* XXX: use host strnlen if available ? */ +int qemu_strnlen(const char *s, int max_len) +{ + int i; + + for(i = 0; i < max_len; i++) { + if (s[i] == '\0') { + break; + } + } + return i; +} + +#ifndef VBOX +time_t mktimegm(struct tm *tm) +{ + time_t t; + int y = tm->tm_year + 1900, m = tm->tm_mon + 1, d = tm->tm_mday; + if (m < 3) { + m += 12; + y--; + } + t = 86400 * (d + (153 * m - 457) / 5 + 365 * y + y / 4 - y / 100 + + y / 400 - 719469); + t += 3600 * tm->tm_hour + 60 * tm->tm_min + tm->tm_sec; + return t; +} +#endif /* !VBOX */ + +int qemu_fls(int i) +{ + return 32 - clz32(i); +} + +#ifndef VBOX + +/* + * Make sure data goes on disk, but if possible do not bother to + * write out the inode just for timestamp updates. + * + * Unfortunately even in 2009 many operating systems do not support + * fdatasync and have to fall back to fsync. + */ +int qemu_fdatasync(int fd) +{ +#ifdef CONFIG_FDATASYNC + return fdatasync(fd); +#else + return fsync(fd); +#endif +} + +/* io vectors */ + +void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint) +{ + qiov->iov = qemu_malloc(alloc_hint * sizeof(struct iovec)); + qiov->niov = 0; + qiov->nalloc = alloc_hint; + qiov->size = 0; +} + +void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov) +{ + int i; + + qiov->iov = iov; + qiov->niov = niov; + qiov->nalloc = -1; + qiov->size = 0; + for (i = 0; i < niov; i++) + qiov->size += iov[i].iov_len; +} + +void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len) +{ + assert(qiov->nalloc != -1); + + if (qiov->niov == qiov->nalloc) { + qiov->nalloc = 2 * qiov->nalloc + 1; + qiov->iov = qemu_realloc(qiov->iov, qiov->nalloc * sizeof(struct iovec)); + } + qiov->iov[qiov->niov].iov_base = base; + qiov->iov[qiov->niov].iov_len = len; + qiov->size += len; + ++qiov->niov; +} + +/* + * Copies iovecs from src to the end dst until src is completely copied or the + * total size of the copied iovec reaches size. The size of the last copied + * iovec is changed in order to fit the specified total size if it isn't a + * perfect fit already. 
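+ * For example, with src holding iovecs of 100 and 200 bytes and size set
+ * to 150, dst gains the whole first iovec plus a 50-byte iovec covering
+ * the start of the second buffer.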
+ */ +void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size) +{ + int i; + size_t done; + + assert(dst->nalloc != -1); + + done = 0; + for (i = 0; (i < src->niov) && (done != size); i++) { + if (done + src->iov[i].iov_len > size) { + qemu_iovec_add(dst, src->iov[i].iov_base, size - done); + break; + } else { + qemu_iovec_add(dst, src->iov[i].iov_base, src->iov[i].iov_len); + } + done += src->iov[i].iov_len; + } +} + +void qemu_iovec_destroy(QEMUIOVector *qiov) +{ + assert(qiov->nalloc != -1); + + qemu_free(qiov->iov); +} + +void qemu_iovec_reset(QEMUIOVector *qiov) +{ + assert(qiov->nalloc != -1); + + qiov->niov = 0; + qiov->size = 0; +} + +void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf) +{ + uint8_t *p = (uint8_t *)buf; + int i; + + for (i = 0; i < qiov->niov; ++i) { + memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len); + p += qiov->iov[i].iov_len; + } +} + +void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count) +{ + const uint8_t *p = (const uint8_t *)buf; + size_t copy; + int i; + + for (i = 0; i < qiov->niov && count; ++i) { + copy = count; + if (copy > qiov->iov[i].iov_len) + copy = qiov->iov[i].iov_len; + memcpy(qiov->iov[i].iov_base, p, copy); + p += copy; + count -= copy; + } +} + +#ifndef _WIN32 +/* Sets a specific flag */ +int fcntl_setfl(int fd, int flag) +{ + int flags; + + flags = fcntl(fd, F_GETFL); + if (flags == -1) + return -errno; + + if (fcntl(fd, F_SETFL, flags | flag) == -1) + return -errno; + + return 0; +} +#endif + +#endif /* !VBOX */ + diff --git a/src/recompiler/def-helper.h b/src/recompiler/def-helper.h new file mode 100644 index 00000000..ad5b9459 --- /dev/null +++ b/src/recompiler/def-helper.h @@ -0,0 +1,258 @@ +/* Helper file for declaring TCG helper functions. + Should be included at the start and end of target-foo/helper.h. + + Targets should use DEF_HELPER_N and DEF_HELPER_FLAGS_N to declare helper + functions. Names should be specified without the helper_ prefix, and + the return and argument types specified. 3 basic types are understood + (i32, i64 and ptr). Additional aliases are provided for convenience and + to match the types used by the C helper implementation. + + The target helper.h should be included in all files that use/define + helper functions. THis will ensure that function prototypes are + consistent. In addition it should be included an extra two times for + helper.c, defining: + GEN_HELPER 1 to produce op generation functions (gen_helper_*) + GEN_HELPER 2 to do runtime registration helper functions. + */ + +#ifndef DEF_HELPER_H +#define DEF_HELPER_H 1 + +#define HELPER(name) glue(helper_, name) + +#define GET_TCGV_i32 GET_TCGV_I32 +#define GET_TCGV_i64 GET_TCGV_I64 +#define GET_TCGV_ptr GET_TCGV_PTR + +/* Some types that make sense in C, but not for TCG. 
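+   They are mapped onto the basic TCG categories (i32, i64, ptr or void)
+   by the dh_alias_* macros below, while dh_ctype_* supplies the matching
+   C type.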
*/ +#define dh_alias_i32 i32 +#define dh_alias_s32 i32 +#define dh_alias_int i32 +#define dh_alias_i64 i64 +#define dh_alias_s64 i64 +#define dh_alias_f32 i32 +#define dh_alias_f64 i64 +#if TARGET_LONG_BITS == 32 +#define dh_alias_tl i32 +#else +#define dh_alias_tl i64 +#endif +#define dh_alias_ptr ptr +#define dh_alias_void void +#define dh_alias_env ptr +#ifdef VBOX +# if ARCH_BITS == 32 +# define dh_alias_RTCCUINTREG i32 +# define dh_alias_RTCCINTREG i32 +# else +# define dh_alias_RTCCUINTREG i64 +# define dh_alias_RTCCINTREG i64 +# endif +#endif +#define dh_alias(t) glue(dh_alias_, t) + +#define dh_ctype_i32 uint32_t +#define dh_ctype_s32 int32_t +#define dh_ctype_int int +#define dh_ctype_i64 uint64_t +#define dh_ctype_s64 int64_t +#define dh_ctype_f32 float32 +#define dh_ctype_f64 float64 +#define dh_ctype_tl target_ulong +#define dh_ctype_ptr void * +#define dh_ctype_void void +#define dh_ctype_env CPUState * +#ifdef VBOX +# if ARCH_BITS == 32 +# define dh_ctype_RTCCUINTREG uint32_t +# define dh_ctype_RTCCINTREG int32_t +# else +# define dh_ctype_RTCCUINTREG uint64_t +# define dh_ctype_RTCCINTREG int64_t +# endif +#endif +#define dh_ctype(t) dh_ctype_##t + +/* We can't use glue() here because it falls foul of C preprocessor + recursive expansion rules. */ +#define dh_retvar_decl0_void void +#define dh_retvar_decl0_i32 TCGv_i32 retval +#define dh_retvar_decl0_i64 TCGv_i64 retval +#define dh_retvar_decl0_ptr TCGv_ptr retval +#define dh_retvar_decl0(t) glue(dh_retvar_decl0_, dh_alias(t)) + +#define dh_retvar_decl_void +#define dh_retvar_decl_i32 TCGv_i32 retval, +#define dh_retvar_decl_i64 TCGv_i64 retval, +#define dh_retvar_decl_ptr TCGv_ptr retval, +#define dh_retvar_decl(t) glue(dh_retvar_decl_, dh_alias(t)) + +#define dh_retvar_void TCG_CALL_DUMMY_ARG +#define dh_retvar_i32 GET_TCGV_i32(retval) +#define dh_retvar_i64 GET_TCGV_i64(retval) +#define dh_retvar_ptr GET_TCGV_ptr(retval) +#define dh_retvar(t) glue(dh_retvar_, dh_alias(t)) + +#define dh_is_64bit_void 0 +#define dh_is_64bit_i32 0 +#define dh_is_64bit_i64 1 +#define dh_is_64bit_ptr (TCG_TARGET_REG_BITS == 64) +#define dh_is_64bit(t) glue(dh_is_64bit_, dh_alias(t)) + +#define dh_is_signed_void 0 +#define dh_is_signed_i32 0 +#define dh_is_signed_s32 1 +#define dh_is_signed_i64 0 +#define dh_is_signed_s64 1 +#define dh_is_signed_f32 0 +#define dh_is_signed_f64 0 +#define dh_is_signed_tl 0 +#define dh_is_signed_int 1 +/* ??? This is highly specific to the host cpu. There are even special + extension instructions that may be required, e.g. ia64's addp4. But + for now we don't support any 64-bit targets with 32-bit pointers. */ +#define dh_is_signed_ptr 0 +#define dh_is_signed_env dh_is_signed_ptr +#define dh_is_signed(t) dh_is_signed_##t + +#define dh_sizemask(t, n) \ + sizemask |= dh_is_64bit(t) << (n*2); \ + sizemask |= dh_is_signed(t) << (n*2+1) + +#define dh_arg(t, n) \ + args[n - 1] = glue(GET_TCGV_, dh_alias(t))(glue(arg, n)); \ + dh_sizemask(t, n) + +#define dh_arg_decl(t, n) glue(TCGv_, dh_alias(t)) glue(arg, n) + + +#define DEF_HELPER_0(name, ret) \ + DEF_HELPER_FLAGS_0(name, 0, ret) +#define DEF_HELPER_1(name, ret, t1) \ + DEF_HELPER_FLAGS_1(name, 0, ret, t1) +#define DEF_HELPER_2(name, ret, t1, t2) \ + DEF_HELPER_FLAGS_2(name, 0, ret, t1, t2) +#define DEF_HELPER_3(name, ret, t1, t2, t3) \ + DEF_HELPER_FLAGS_3(name, 0, ret, t1, t2, t3) +#define DEF_HELPER_4(name, ret, t1, t2, t3, t4) \ + DEF_HELPER_FLAGS_4(name, 0, ret, t1, t2, t3, t4) + +#endif /* DEF_HELPER_H */ + +#ifndef GEN_HELPER +/* Function prototypes. 
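+   As an illustration only (the helper name is made up), a target's
+   DEF_HELPER_2(foo, i32, i32, i32) expands in this section to
+       uint32_t helper_foo (uint32_t, uint32_t);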
*/ + +#define DEF_HELPER_FLAGS_0(name, flags, ret) \ +dh_ctype(ret) HELPER(name) (void); + +#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \ +dh_ctype(ret) HELPER(name) (dh_ctype(t1)); + +#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \ +dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)); + +#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \ +dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3)); + +#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \ +dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \ + dh_ctype(t4)); + +#undef GEN_HELPER +#define GEN_HELPER -1 + +#elif GEN_HELPER == 1 +/* Gen functions. */ + +#define DEF_HELPER_FLAGS_0(name, flags, ret) \ +static inline void glue(gen_helper_, name)(dh_retvar_decl0(ret)) \ +{ \ + int sizemask; \ + sizemask = dh_is_64bit(ret); \ + tcg_gen_helperN(HELPER(name), flags, sizemask, dh_retvar(ret), 0, NULL); \ +} + +#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \ +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret) dh_arg_decl(t1, 1)) \ +{ \ + TCGArg args[1]; \ + int sizemask = 0; \ + dh_sizemask(ret, 0); \ + dh_arg(t1, 1); \ + tcg_gen_helperN(HELPER(name), flags, sizemask, dh_retvar(ret), 1, args); \ +} + +#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \ +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret) dh_arg_decl(t1, 1), \ + dh_arg_decl(t2, 2)) \ +{ \ + TCGArg args[2]; \ + int sizemask = 0; \ + dh_sizemask(ret, 0); \ + dh_arg(t1, 1); \ + dh_arg(t2, 2); \ + tcg_gen_helperN(HELPER(name), flags, sizemask, dh_retvar(ret), 2, args); \ +} + +#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \ +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret) dh_arg_decl(t1, 1), \ + dh_arg_decl(t2, 2), dh_arg_decl(t3, 3)) \ +{ \ + TCGArg args[3]; \ + int sizemask = 0; \ + dh_sizemask(ret, 0); \ + dh_arg(t1, 1); \ + dh_arg(t2, 2); \ + dh_arg(t3, 3); \ + tcg_gen_helperN(HELPER(name), flags, sizemask, dh_retvar(ret), 3, args); \ +} + +#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \ +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret) dh_arg_decl(t1, 1), \ + dh_arg_decl(t2, 2), dh_arg_decl(t3, 3), dh_arg_decl(t4, 4)) \ +{ \ + TCGArg args[4]; \ + int sizemask = 0; \ + dh_sizemask(ret, 0); \ + dh_arg(t1, 1); \ + dh_arg(t2, 2); \ + dh_arg(t3, 3); \ + dh_arg(t4, 4); \ + tcg_gen_helperN(HELPER(name), flags, sizemask, dh_retvar(ret), 4, args); \ +} + +#undef GEN_HELPER +#define GEN_HELPER -1 + +#elif GEN_HELPER == 2 +/* Register helpers. */ + +#define DEF_HELPER_FLAGS_0(name, flags, ret) \ +tcg_register_helper(HELPER(name), #name); + +#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \ +DEF_HELPER_FLAGS_0(name, flags, ret) + +#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \ +DEF_HELPER_FLAGS_0(name, flags, ret) + +#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \ +DEF_HELPER_FLAGS_0(name, flags, ret) + +#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \ +DEF_HELPER_FLAGS_0(name, flags, ret) + +#undef GEN_HELPER +#define GEN_HELPER -1 + +#elif GEN_HELPER == -1 +/* Undefine macros. 
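+   These are cleared so that this file can be included again with a
+   different GEN_HELPER setting, as described at the top of the file.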
*/ + +#undef DEF_HELPER_FLAGS_0 +#undef DEF_HELPER_FLAGS_1 +#undef DEF_HELPER_FLAGS_2 +#undef DEF_HELPER_FLAGS_3 +#undef DEF_HELPER_FLAGS_4 +#undef GEN_HELPER + +#endif diff --git a/src/recompiler/disas.h b/src/recompiler/disas.h new file mode 100644 index 00000000..a20df127 --- /dev/null +++ b/src/recompiler/disas.h @@ -0,0 +1,47 @@ +#ifndef _QEMU_DISAS_H +#define _QEMU_DISAS_H + +#include "qemu-common.h" + +#ifdef NEED_CPU_H +/* Disassemble this for me please... (debugging). */ +void disas(FILE *out, void *code, unsigned long size); +void target_disas(FILE *out, target_ulong code, target_ulong size, int flags); + +#ifndef VBOX +/* The usual mess... FIXME: Remove this condition once dyngen-exec.h is gone */ +#ifndef __DYNGEN_EXEC_H__ +void monitor_disas(Monitor *mon, CPUState *env, + target_ulong pc, int nb_insn, int is_physical, int flags); +#endif +#endif /*!VBOX*/ + +/* Look up symbol for debugging purpose. Returns "" if unknown. */ +const char *lookup_symbol(target_ulong orig_addr); +#endif + +struct syminfo; +struct elf32_sym; +struct elf64_sym; + +#if defined(CONFIG_USER_ONLY) +typedef const char *(*lookup_symbol_t)(struct syminfo *s, target_ulong orig_addr); +#else +typedef const char *(*lookup_symbol_t)(struct syminfo *s, target_phys_addr_t orig_addr); +#endif + +struct syminfo { + lookup_symbol_t lookup_symbol; + unsigned int disas_num_syms; + union { + struct elf32_sym *elf32; + struct elf64_sym *elf64; + } disas_symtab; + const char *disas_strtab; + struct syminfo *next; +}; + +/* Filled in by elfload.c. Simplistic, but will do for now. */ +extern struct syminfo *syminfos; + +#endif /* _QEMU_DISAS_H */ diff --git a/src/recompiler/dyngen-exec.h b/src/recompiler/dyngen-exec.h new file mode 100644 index 00000000..5397a211 --- /dev/null +++ b/src/recompiler/dyngen-exec.h @@ -0,0 +1,131 @@ +/* + * dyngen defines for micro operation code + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#if !defined(__DYNGEN_EXEC_H__) +#define __DYNGEN_EXEC_H__ + +#ifndef VBOX +/* prevent Solaris from trying to typedef FILE in gcc's + include/floatingpoint.h which will conflict with the + definition down below */ +#ifdef __sun__ +#define _FILEDEFED +#endif +#endif /* !VBOX */ + +/* NOTE: standard headers should be used with special care at this + point because host CPU registers are used as global variables. 
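+   (AREG0 below names the host register reserved for the CPU state
+   pointer; per-target code is expected to bind it with something like
+   register CPUState *env asm(AREG0); -- given here only as an
+   illustration.)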
Some + host headers do not allow that. */ +#include <stddef.h> +#ifndef VBOX +#include <stdint.h> + +#ifdef __OpenBSD__ +#include <sys/types.h> +#endif + +/* XXX: This may be wrong for 64-bit ILP32 hosts. */ +typedef void * host_reg_t; + +#ifdef CONFIG_BSD +typedef struct __sFILE FILE; +#else +typedef struct FILE FILE; +#endif +extern int fprintf(FILE *, const char *, ...); +extern int fputs(const char *, FILE *); +extern int printf(const char *, ...); + +#else /* VBOX */ + +/* XXX: This may be wrong for 64-bit ILP32 hosts. */ +typedef void * host_reg_t; + +#include <iprt/stdint.h> +#include <stdio.h> + +#endif /* VBOX */ + +#if defined(__i386__) +# ifndef VBOX +#define AREG0 "ebp" +# else /* VBOX - why are we different? frame-pointer optimizations on mac? */ +# define AREG0 "esi" +# endif /* VBOX */ +#elif defined(__x86_64__) +#define AREG0 "r14" +#elif defined(_ARCH_PPC) +#define AREG0 "r27" +#elif defined(__arm__) +#define AREG0 "r7" +#elif defined(__hppa__) +#define AREG0 "r17" +#elif defined(__mips__) +#define AREG0 "s0" +#elif defined(__sparc__) +#ifdef CONFIG_SOLARIS +#define AREG0 "g2" +#else +#ifdef __sparc_v9__ +#define AREG0 "g5" +#else +#define AREG0 "g6" +#endif +#endif +#elif defined(__s390__) +#define AREG0 "r10" +#elif defined(__alpha__) +/* Note $15 is the frame pointer, so anything in op-i386.c that would + require a frame pointer, like alloca, would probably loose. */ +#define AREG0 "$15" +#elif defined(__mc68000) +#define AREG0 "%a5" +#elif defined(__ia64__) +#define AREG0 "r7" +#else +#error unsupported CPU +#endif + +#define xglue(x, y) x ## y +#define glue(x, y) xglue(x, y) +#define stringify(s) tostring(s) +#define tostring(s) #s + +/* The return address may point to the start of the next instruction. + Subtracting one gets us the call instruction itself. */ +#if defined(__s390__) && !defined(__s390x__) +# define GETPC() ((void*)(((uintptr_t)__builtin_return_address(0) & 0x7fffffffUL) - 1)) +#elif defined(__arm__) +/* Thumb return addresses have the low bit set, so we need to subtract two. + This is still safe in ARM mode because instructions are 4 bytes. */ +# define GETPC() ((void *)((uintptr_t)__builtin_return_address(0) - 2)) +#else +# define GETPC() ((void *)((uintptr_t)__builtin_return_address(0) - 1)) +#endif + +#endif /* !defined(__DYNGEN_EXEC_H__) */ diff --git a/src/recompiler/elf.h b/src/recompiler/elf.h new file mode 100644 index 00000000..eb9e3bec --- /dev/null +++ b/src/recompiler/elf.h @@ -0,0 +1,1196 @@ +#ifndef _QEMU_ELF_H +#define _QEMU_ELF_H + +#include <inttypes.h> + +/* 32-bit ELF base types. */ +typedef uint32_t Elf32_Addr; +typedef uint16_t Elf32_Half; +typedef uint32_t Elf32_Off; +typedef int32_t Elf32_Sword; +typedef uint32_t Elf32_Word; + +/* 64-bit ELF base types. */ +typedef uint64_t Elf64_Addr; +typedef uint16_t Elf64_Half; +typedef int16_t Elf64_SHalf; +typedef uint64_t Elf64_Off; +typedef int32_t Elf64_Sword; +typedef uint32_t Elf64_Word; +typedef uint64_t Elf64_Xword; +typedef int64_t Elf64_Sxword; + +/* These constants are for the segment types stored in the image headers */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_MIPS_REGINFO 0x70000000 +#define PT_MIPS_OPTIONS 0x70000001 + +/* Flags in the e_flags field of the header */ +/* MIPS architecture level. */ +#define EF_MIPS_ARCH_1 0x00000000 /* -mips1 code. */ +#define EF_MIPS_ARCH_2 0x10000000 /* -mips2 code. 
*/ +#define EF_MIPS_ARCH_3 0x20000000 /* -mips3 code. */ +#define EF_MIPS_ARCH_4 0x30000000 /* -mips4 code. */ +#define EF_MIPS_ARCH_5 0x40000000 /* -mips5 code. */ +#define EF_MIPS_ARCH_32 0x50000000 /* MIPS32 code. */ +#define EF_MIPS_ARCH_64 0x60000000 /* MIPS64 code. */ + +/* The ABI of a file. */ +#define EF_MIPS_ABI_O32 0x00001000 /* O32 ABI. */ +#define EF_MIPS_ABI_O64 0x00002000 /* O32 extended for 64 bit. */ + +#define EF_MIPS_NOREORDER 0x00000001 +#define EF_MIPS_PIC 0x00000002 +#define EF_MIPS_CPIC 0x00000004 +#define EF_MIPS_ABI2 0x00000020 +#define EF_MIPS_OPTIONS_FIRST 0x00000080 +#define EF_MIPS_32BITMODE 0x00000100 +#define EF_MIPS_ABI 0x0000f000 +#define EF_MIPS_ARCH 0xf0000000 + +/* These constants define the different elf file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* These constants define the various ELF target machines */ +#define EM_NONE 0 +#define EM_M32 1 +#define EM_SPARC 2 +#define EM_386 3 +#define EM_68K 4 +#define EM_88K 5 +#define EM_486 6 /* Perhaps disused */ +#define EM_860 7 + +#define EM_MIPS 8 /* MIPS R3000 (officially, big-endian only) */ + +#define EM_MIPS_RS4_BE 10 /* MIPS R4000 big-endian */ + +#define EM_PARISC 15 /* HPPA */ + +#define EM_SPARC32PLUS 18 /* Sun's "v8plus" */ + +#define EM_PPC 20 /* PowerPC */ +#define EM_PPC64 21 /* PowerPC64 */ + +#define EM_ARM 40 /* ARM */ + +#define EM_SH 42 /* SuperH */ + +#define EM_SPARCV9 43 /* SPARC v9 64-bit */ + +#define EM_IA_64 50 /* HP/Intel IA-64 */ + +#define EM_X86_64 62 /* AMD x86-64 */ + +#define EM_S390 22 /* IBM S/390 */ + +#define EM_CRIS 76 /* Axis Communications 32-bit embedded processor */ + +#define EM_V850 87 /* NEC v850 */ + +#define EM_H8_300H 47 /* Hitachi H8/300H */ +#define EM_H8S 48 /* Hitachi H8S */ + +/* + * This is an interim value that we will use until the committee comes + * up with a final number. + */ +#define EM_ALPHA 0x9026 + +/* Bogus old v850 magic number, used by old tools. 
*/ +#define EM_CYGNUS_V850 0x9080 + +/* + * This is the old interim value for S/390 architecture + */ +#define EM_S390_OLD 0xA390 + +#define EM_MICROBLAZE 189 +#define EM_MICROBLAZE_OLD 0xBAAB + +/* This is the info that is needed to parse the dynamic section of the file */ +#define DT_NULL 0 +#define DT_NEEDED 1 +#define DT_PLTRELSZ 2 +#define DT_PLTGOT 3 +#define DT_HASH 4 +#define DT_STRTAB 5 +#define DT_SYMTAB 6 +#define DT_RELA 7 +#define DT_RELASZ 8 +#define DT_RELAENT 9 +#define DT_STRSZ 10 +#define DT_SYMENT 11 +#define DT_INIT 12 +#define DT_FINI 13 +#define DT_SONAME 14 +#define DT_RPATH 15 +#define DT_SYMBOLIC 16 +#define DT_REL 17 +#define DT_RELSZ 18 +#define DT_RELENT 19 +#define DT_PLTREL 20 +#define DT_DEBUG 21 +#define DT_TEXTREL 22 +#define DT_JMPREL 23 +#define DT_LOPROC 0x70000000 +#define DT_HIPROC 0x7fffffff +#define DT_MIPS_RLD_VERSION 0x70000001 +#define DT_MIPS_TIME_STAMP 0x70000002 +#define DT_MIPS_ICHECKSUM 0x70000003 +#define DT_MIPS_IVERSION 0x70000004 +#define DT_MIPS_FLAGS 0x70000005 + #define RHF_NONE 0 + #define RHF_HARDWAY 1 + #define RHF_NOTPOT 2 +#define DT_MIPS_BASE_ADDRESS 0x70000006 +#define DT_MIPS_CONFLICT 0x70000008 +#define DT_MIPS_LIBLIST 0x70000009 +#define DT_MIPS_LOCAL_GOTNO 0x7000000a +#define DT_MIPS_CONFLICTNO 0x7000000b +#define DT_MIPS_LIBLISTNO 0x70000010 +#define DT_MIPS_SYMTABNO 0x70000011 +#define DT_MIPS_UNREFEXTNO 0x70000012 +#define DT_MIPS_GOTSYM 0x70000013 +#define DT_MIPS_HIPAGENO 0x70000014 +#define DT_MIPS_RLD_MAP 0x70000016 + +/* This info is needed when parsing the symbol table */ +#define STB_LOCAL 0 +#define STB_GLOBAL 1 +#define STB_WEAK 2 + +#define STT_NOTYPE 0 +#define STT_OBJECT 1 +#define STT_FUNC 2 +#define STT_SECTION 3 +#define STT_FILE 4 + +#define ELF_ST_BIND(x) ((x) >> 4) +#define ELF_ST_TYPE(x) (((unsigned int) x) & 0xf) +#define ELF32_ST_BIND(x) ELF_ST_BIND(x) +#define ELF32_ST_TYPE(x) ELF_ST_TYPE(x) +#define ELF64_ST_BIND(x) ELF_ST_BIND(x) +#define ELF64_ST_TYPE(x) ELF_ST_TYPE(x) + +/* Symbolic values for the entries in the auxiliary table + put on the initial stack */ +#define AT_NULL 0 /* end of vector */ +#define AT_IGNORE 1 /* entry should be ignored */ +#define AT_EXECFD 2 /* file descriptor of program */ +#define AT_PHDR 3 /* program headers for program */ +#define AT_PHENT 4 /* size of program header entry */ +#define AT_PHNUM 5 /* number of program headers */ +#define AT_PAGESZ 6 /* system page size */ +#define AT_BASE 7 /* base address of interpreter */ +#define AT_FLAGS 8 /* flags */ +#define AT_ENTRY 9 /* entry point of program */ +#define AT_NOTELF 10 /* program is not ELF */ +#define AT_UID 11 /* real uid */ +#define AT_EUID 12 /* effective uid */ +#define AT_GID 13 /* real gid */ +#define AT_EGID 14 /* effective gid */ +#define AT_PLATFORM 15 /* string identifying CPU for optimizations */ +#define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */ +#define AT_CLKTCK 17 /* frequency at which times() increments */ + +typedef struct dynamic{ + Elf32_Sword d_tag; + union{ + Elf32_Sword d_val; + Elf32_Addr d_ptr; + } d_un; +} Elf32_Dyn; + +typedef struct { + Elf64_Sxword d_tag; /* entry tag value */ + union { + Elf64_Xword d_val; + Elf64_Addr d_ptr; + } d_un; +} Elf64_Dyn; + +/* The following are used with relocations */ +#define ELF32_R_SYM(x) ((x) >> 8) +#define ELF32_R_TYPE(x) ((x) & 0xff) + +#define ELF64_R_SYM(i) ((i) >> 32) +#define ELF64_R_TYPE(i) ((i) & 0xffffffff) +#define ELF64_R_TYPE_DATA(i) (((ELF64_R_TYPE(i) >> 8) ^ 0x00800000) - 0x00800000) + +#define R_386_NONE 0 +#define 
R_386_32 1 +#define R_386_PC32 2 +#define R_386_GOT32 3 +#define R_386_PLT32 4 +#define R_386_COPY 5 +#define R_386_GLOB_DAT 6 +#define R_386_JMP_SLOT 7 +#define R_386_RELATIVE 8 +#define R_386_GOTOFF 9 +#define R_386_GOTPC 10 +#define R_386_NUM 11 +/* Not a dynamic reloc, so not included in R_386_NUM. Used in TCG. */ +#define R_386_PC8 23 + +#define R_MIPS_NONE 0 +#define R_MIPS_16 1 +#define R_MIPS_32 2 +#define R_MIPS_REL32 3 +#define R_MIPS_26 4 +#define R_MIPS_HI16 5 +#define R_MIPS_LO16 6 +#define R_MIPS_GPREL16 7 +#define R_MIPS_LITERAL 8 +#define R_MIPS_GOT16 9 +#define R_MIPS_PC16 10 +#define R_MIPS_CALL16 11 +#define R_MIPS_GPREL32 12 +/* The remaining relocs are defined on Irix, although they are not + in the MIPS ELF ABI. */ +#define R_MIPS_UNUSED1 13 +#define R_MIPS_UNUSED2 14 +#define R_MIPS_UNUSED3 15 +#define R_MIPS_SHIFT5 16 +#define R_MIPS_SHIFT6 17 +#define R_MIPS_64 18 +#define R_MIPS_GOT_DISP 19 +#define R_MIPS_GOT_PAGE 20 +#define R_MIPS_GOT_OFST 21 +/* + * The following two relocation types are specified in the MIPS ABI + * conformance guide version 1.2 but not yet in the psABI. + */ +#define R_MIPS_GOTHI16 22 +#define R_MIPS_GOTLO16 23 +#define R_MIPS_SUB 24 +#define R_MIPS_INSERT_A 25 +#define R_MIPS_INSERT_B 26 +#define R_MIPS_DELETE 27 +#define R_MIPS_HIGHER 28 +#define R_MIPS_HIGHEST 29 +/* + * The following two relocation types are specified in the MIPS ABI + * conformance guide version 1.2 but not yet in the psABI. + */ +#define R_MIPS_CALLHI16 30 +#define R_MIPS_CALLLO16 31 +/* + * This range is reserved for vendor specific relocations. + */ +#define R_MIPS_LOVENDOR 100 +#define R_MIPS_HIVENDOR 127 + + +/* + * Sparc ELF relocation types + */ +#define R_SPARC_NONE 0 +#define R_SPARC_8 1 +#define R_SPARC_16 2 +#define R_SPARC_32 3 +#define R_SPARC_DISP8 4 +#define R_SPARC_DISP16 5 +#define R_SPARC_DISP32 6 +#define R_SPARC_WDISP30 7 +#define R_SPARC_WDISP22 8 +#define R_SPARC_HI22 9 +#define R_SPARC_22 10 +#define R_SPARC_13 11 +#define R_SPARC_LO10 12 +#define R_SPARC_GOT10 13 +#define R_SPARC_GOT13 14 +#define R_SPARC_GOT22 15 +#define R_SPARC_PC10 16 +#define R_SPARC_PC22 17 +#define R_SPARC_WPLT30 18 +#define R_SPARC_COPY 19 +#define R_SPARC_GLOB_DAT 20 +#define R_SPARC_JMP_SLOT 21 +#define R_SPARC_RELATIVE 22 +#define R_SPARC_UA32 23 +#define R_SPARC_PLT32 24 +#define R_SPARC_HIPLT22 25 +#define R_SPARC_LOPLT10 26 +#define R_SPARC_PCPLT32 27 +#define R_SPARC_PCPLT22 28 +#define R_SPARC_PCPLT10 29 +#define R_SPARC_10 30 +#define R_SPARC_11 31 +#define R_SPARC_64 32 +#define R_SPARC_OLO10 33 +#define R_SPARC_HH22 34 +#define R_SPARC_HM10 35 +#define R_SPARC_LM22 36 +#define R_SPARC_WDISP16 40 +#define R_SPARC_WDISP19 41 +#define R_SPARC_7 43 +#define R_SPARC_5 44 +#define R_SPARC_6 45 + +/* Bits present in AT_HWCAP, primarily for Sparc32. */ + +#define HWCAP_SPARC_FLUSH 1 /* CPU supports flush instruction. 
*/ +#define HWCAP_SPARC_STBAR 2 +#define HWCAP_SPARC_SWAP 4 +#define HWCAP_SPARC_MULDIV 8 +#define HWCAP_SPARC_V9 16 +#define HWCAP_SPARC_ULTRA3 32 + +/* + * 68k ELF relocation types + */ +#define R_68K_NONE 0 +#define R_68K_32 1 +#define R_68K_16 2 +#define R_68K_8 3 +#define R_68K_PC32 4 +#define R_68K_PC16 5 +#define R_68K_PC8 6 +#define R_68K_GOT32 7 +#define R_68K_GOT16 8 +#define R_68K_GOT8 9 +#define R_68K_GOT32O 10 +#define R_68K_GOT16O 11 +#define R_68K_GOT8O 12 +#define R_68K_PLT32 13 +#define R_68K_PLT16 14 +#define R_68K_PLT8 15 +#define R_68K_PLT32O 16 +#define R_68K_PLT16O 17 +#define R_68K_PLT8O 18 +#define R_68K_COPY 19 +#define R_68K_GLOB_DAT 20 +#define R_68K_JMP_SLOT 21 +#define R_68K_RELATIVE 22 + +/* + * Alpha ELF relocation types + */ +#define R_ALPHA_NONE 0 /* No reloc */ +#define R_ALPHA_REFLONG 1 /* Direct 32 bit */ +#define R_ALPHA_REFQUAD 2 /* Direct 64 bit */ +#define R_ALPHA_GPREL32 3 /* GP relative 32 bit */ +#define R_ALPHA_LITERAL 4 /* GP relative 16 bit w/optimization */ +#define R_ALPHA_LITUSE 5 /* Optimization hint for LITERAL */ +#define R_ALPHA_GPDISP 6 /* Add displacement to GP */ +#define R_ALPHA_BRADDR 7 /* PC+4 relative 23 bit shifted */ +#define R_ALPHA_HINT 8 /* PC+4 relative 16 bit shifted */ +#define R_ALPHA_SREL16 9 /* PC relative 16 bit */ +#define R_ALPHA_SREL32 10 /* PC relative 32 bit */ +#define R_ALPHA_SREL64 11 /* PC relative 64 bit */ +#define R_ALPHA_GPRELHIGH 17 /* GP relative 32 bit, high 16 bits */ +#define R_ALPHA_GPRELLOW 18 /* GP relative 32 bit, low 16 bits */ +#define R_ALPHA_GPREL16 19 /* GP relative 16 bit */ +#define R_ALPHA_COPY 24 /* Copy symbol at runtime */ +#define R_ALPHA_GLOB_DAT 25 /* Create GOT entry */ +#define R_ALPHA_JMP_SLOT 26 /* Create PLT entry */ +#define R_ALPHA_RELATIVE 27 /* Adjust by program base */ +#define R_ALPHA_BRSGP 28 +#define R_ALPHA_TLSGD 29 +#define R_ALPHA_TLS_LDM 30 +#define R_ALPHA_DTPMOD64 31 +#define R_ALPHA_GOTDTPREL 32 +#define R_ALPHA_DTPREL64 33 +#define R_ALPHA_DTPRELHI 34 +#define R_ALPHA_DTPRELLO 35 +#define R_ALPHA_DTPREL16 36 +#define R_ALPHA_GOTTPREL 37 +#define R_ALPHA_TPREL64 38 +#define R_ALPHA_TPRELHI 39 +#define R_ALPHA_TPRELLO 40 +#define R_ALPHA_TPREL16 41 + +#define SHF_ALPHA_GPREL 0x10000000 + + +/* PowerPC relocations defined by the ABIs */ +#define R_PPC_NONE 0 +#define R_PPC_ADDR32 1 /* 32bit absolute address */ +#define R_PPC_ADDR24 2 /* 26bit address, 2 bits ignored. 
*/ +#define R_PPC_ADDR16 3 /* 16bit absolute address */ +#define R_PPC_ADDR16_LO 4 /* lower 16bit of absolute address */ +#define R_PPC_ADDR16_HI 5 /* high 16bit of absolute address */ +#define R_PPC_ADDR16_HA 6 /* adjusted high 16bit */ +#define R_PPC_ADDR14 7 /* 16bit address, 2 bits ignored */ +#define R_PPC_ADDR14_BRTAKEN 8 +#define R_PPC_ADDR14_BRNTAKEN 9 +#define R_PPC_REL24 10 /* PC relative 26 bit */ +#define R_PPC_REL14 11 /* PC relative 16 bit */ +#define R_PPC_REL14_BRTAKEN 12 +#define R_PPC_REL14_BRNTAKEN 13 +#define R_PPC_GOT16 14 +#define R_PPC_GOT16_LO 15 +#define R_PPC_GOT16_HI 16 +#define R_PPC_GOT16_HA 17 +#define R_PPC_PLTREL24 18 +#define R_PPC_COPY 19 +#define R_PPC_GLOB_DAT 20 +#define R_PPC_JMP_SLOT 21 +#define R_PPC_RELATIVE 22 +#define R_PPC_LOCAL24PC 23 +#define R_PPC_UADDR32 24 +#define R_PPC_UADDR16 25 +#define R_PPC_REL32 26 +#define R_PPC_PLT32 27 +#define R_PPC_PLTREL32 28 +#define R_PPC_PLT16_LO 29 +#define R_PPC_PLT16_HI 30 +#define R_PPC_PLT16_HA 31 +#define R_PPC_SDAREL16 32 +#define R_PPC_SECTOFF 33 +#define R_PPC_SECTOFF_LO 34 +#define R_PPC_SECTOFF_HI 35 +#define R_PPC_SECTOFF_HA 36 +/* Keep this the last entry. */ +#ifndef R_PPC_NUM +#define R_PPC_NUM 37 +#endif + +/* ARM specific declarations */ + +/* Processor specific flags for the ELF header e_flags field. */ +#define EF_ARM_RELEXEC 0x01 +#define EF_ARM_HASENTRY 0x02 +#define EF_ARM_INTERWORK 0x04 +#define EF_ARM_APCS_26 0x08 +#define EF_ARM_APCS_FLOAT 0x10 +#define EF_ARM_PIC 0x20 +#define EF_ALIGN8 0x40 /* 8-bit structure alignment is in use */ +#define EF_NEW_ABI 0x80 +#define EF_OLD_ABI 0x100 + +/* Additional symbol types for Thumb */ +#define STT_ARM_TFUNC 0xd + +/* ARM-specific values for sh_flags */ +#define SHF_ARM_ENTRYSECT 0x10000000 /* Section contains an entry point */ +#define SHF_ARM_COMDEF 0x80000000 /* Section may be multiply defined + in the input to a link step */ + +/* ARM-specific program header flags */ +#define PF_ARM_SB 0x10000000 /* Segment contains the location + addressed by the static base */ + +/* ARM relocs. */ +#define R_ARM_NONE 0 /* No reloc */ +#define R_ARM_PC24 1 /* PC relative 26 bit branch */ +#define R_ARM_ABS32 2 /* Direct 32 bit */ +#define R_ARM_REL32 3 /* PC relative 32 bit */ +#define R_ARM_PC13 4 +#define R_ARM_ABS16 5 /* Direct 16 bit */ +#define R_ARM_ABS12 6 /* Direct 12 bit */ +#define R_ARM_THM_ABS5 7 +#define R_ARM_ABS8 8 /* Direct 8 bit */ +#define R_ARM_SBREL32 9 +#define R_ARM_THM_PC22 10 +#define R_ARM_THM_PC8 11 +#define R_ARM_AMP_VCALL9 12 +#define R_ARM_SWI24 13 +#define R_ARM_THM_SWI8 14 +#define R_ARM_XPC25 15 +#define R_ARM_THM_XPC22 16 +#define R_ARM_COPY 20 /* Copy symbol at runtime */ +#define R_ARM_GLOB_DAT 21 /* Create GOT entry */ +#define R_ARM_JUMP_SLOT 22 /* Create PLT entry */ +#define R_ARM_RELATIVE 23 /* Adjust by program base */ +#define R_ARM_GOTOFF 24 /* 32 bit offset to GOT */ +#define R_ARM_GOTPC 25 /* 32 bit PC relative offset to GOT */ +#define R_ARM_GOT32 26 /* 32 bit GOT entry */ +#define R_ARM_PLT32 27 /* 32 bit PLT address */ +#define R_ARM_CALL 28 +#define R_ARM_JUMP24 29 +#define R_ARM_GNU_VTENTRY 100 +#define R_ARM_GNU_VTINHERIT 101 +#define R_ARM_THM_PC11 102 /* thumb unconditional branch */ +#define R_ARM_THM_PC9 103 /* thumb conditional branch */ +#define R_ARM_RXPC25 249 +#define R_ARM_RSBREL32 250 +#define R_ARM_THM_RPC22 251 +#define R_ARM_RREL32 252 +#define R_ARM_RABS22 253 +#define R_ARM_RPC24 254 +#define R_ARM_RBASE 255 +/* Keep this the last entry. 
*/ +#define R_ARM_NUM 256 + +/* s390 relocations defined by the ABIs */ +#define R_390_NONE 0 /* No reloc. */ +#define R_390_8 1 /* Direct 8 bit. */ +#define R_390_12 2 /* Direct 12 bit. */ +#define R_390_16 3 /* Direct 16 bit. */ +#define R_390_32 4 /* Direct 32 bit. */ +#define R_390_PC32 5 /* PC relative 32 bit. */ +#define R_390_GOT12 6 /* 12 bit GOT offset. */ +#define R_390_GOT32 7 /* 32 bit GOT offset. */ +#define R_390_PLT32 8 /* 32 bit PC relative PLT address. */ +#define R_390_COPY 9 /* Copy symbol at runtime. */ +#define R_390_GLOB_DAT 10 /* Create GOT entry. */ +#define R_390_JMP_SLOT 11 /* Create PLT entry. */ +#define R_390_RELATIVE 12 /* Adjust by program base. */ +#define R_390_GOTOFF32 13 /* 32 bit offset to GOT. */ +#define R_390_GOTPC 14 /* 32 bit PC rel. offset to GOT. */ +#define R_390_GOT16 15 /* 16 bit GOT offset. */ +#define R_390_PC16 16 /* PC relative 16 bit. */ +#define R_390_PC16DBL 17 /* PC relative 16 bit shifted by 1. */ +#define R_390_PLT16DBL 18 /* 16 bit PC rel. PLT shifted by 1. */ +#define R_390_PC32DBL 19 /* PC relative 32 bit shifted by 1. */ +#define R_390_PLT32DBL 20 /* 32 bit PC rel. PLT shifted by 1. */ +#define R_390_GOTPCDBL 21 /* 32 bit PC rel. GOT shifted by 1. */ +#define R_390_64 22 /* Direct 64 bit. */ +#define R_390_PC64 23 /* PC relative 64 bit. */ +#define R_390_GOT64 24 /* 64 bit GOT offset. */ +#define R_390_PLT64 25 /* 64 bit PC relative PLT address. */ +#define R_390_GOTENT 26 /* 32 bit PC rel. to GOT entry >> 1. */ +#define R_390_GOTOFF16 27 /* 16 bit offset to GOT. */ +#define R_390_GOTOFF64 28 /* 64 bit offset to GOT. */ +#define R_390_GOTPLT12 29 /* 12 bit offset to jump slot. */ +#define R_390_GOTPLT16 30 /* 16 bit offset to jump slot. */ +#define R_390_GOTPLT32 31 /* 32 bit offset to jump slot. */ +#define R_390_GOTPLT64 32 /* 64 bit offset to jump slot. */ +#define R_390_GOTPLTENT 33 /* 32 bit rel. offset to jump slot. */ +#define R_390_PLTOFF16 34 /* 16 bit offset from GOT to PLT. */ +#define R_390_PLTOFF32 35 /* 32 bit offset from GOT to PLT. */ +#define R_390_PLTOFF64 36 /* 16 bit offset from GOT to PLT. */ +#define R_390_TLS_LOAD 37 /* Tag for load insn in TLS code. */ +#define R_390_TLS_GDCALL 38 /* Tag for function call in general + dynamic TLS code. */ +#define R_390_TLS_LDCALL 39 /* Tag for function call in local + dynamic TLS code. */ +#define R_390_TLS_GD32 40 /* Direct 32 bit for general dynamic + thread local data. */ +#define R_390_TLS_GD64 41 /* Direct 64 bit for general dynamic + thread local data. */ +#define R_390_TLS_GOTIE12 42 /* 12 bit GOT offset for static TLS + block offset. */ +#define R_390_TLS_GOTIE32 43 /* 32 bit GOT offset for static TLS + block offset. */ +#define R_390_TLS_GOTIE64 44 /* 64 bit GOT offset for static TLS + block offset. */ +#define R_390_TLS_LDM32 45 /* Direct 32 bit for local dynamic + thread local data in LD code. */ +#define R_390_TLS_LDM64 46 /* Direct 64 bit for local dynamic + thread local data in LD code. */ +#define R_390_TLS_IE32 47 /* 32 bit address of GOT entry for + negated static TLS block offset. */ +#define R_390_TLS_IE64 48 /* 64 bit address of GOT entry for + negated static TLS block offset. */ +#define R_390_TLS_IEENT 49 /* 32 bit rel. offset to GOT entry for + negated static TLS block offset. */ +#define R_390_TLS_LE32 50 /* 32 bit negated offset relative to + static TLS block. */ +#define R_390_TLS_LE64 51 /* 64 bit negated offset relative to + static TLS block. */ +#define R_390_TLS_LDO32 52 /* 32 bit offset relative to TLS + block. 
*/ +#define R_390_TLS_LDO64 53 /* 64 bit offset relative to TLS + block. */ +#define R_390_TLS_DTPMOD 54 /* ID of module containing symbol. */ +#define R_390_TLS_DTPOFF 55 /* Offset in TLS block. */ +#define R_390_TLS_TPOFF 56 /* Negate offset in static TLS + block. */ +/* Keep this the last entry. */ +#define R_390_NUM 57 + +/* x86-64 relocation types */ +#define R_X86_64_NONE 0 /* No reloc */ +#define R_X86_64_64 1 /* Direct 64 bit */ +#define R_X86_64_PC32 2 /* PC relative 32 bit signed */ +#define R_X86_64_GOT32 3 /* 32 bit GOT entry */ +#define R_X86_64_PLT32 4 /* 32 bit PLT address */ +#define R_X86_64_COPY 5 /* Copy symbol at runtime */ +#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ +#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ +#define R_X86_64_RELATIVE 8 /* Adjust by program base */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative + offset to GOT */ +#define R_X86_64_32 10 /* Direct 32 bit zero extended */ +#define R_X86_64_32S 11 /* Direct 32 bit sign extended */ +#define R_X86_64_16 12 /* Direct 16 bit zero extended */ +#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ +#define R_X86_64_8 14 /* Direct 8 bit sign extended */ +#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ + +#define R_X86_64_NUM 16 + +/* Legal values for e_flags field of Elf64_Ehdr. */ + +#define EF_ALPHA_32BIT 1 /* All addresses are below 2GB */ + +/* HPPA specific definitions. */ + +/* Legal values for e_flags field of Elf32_Ehdr. */ + +#define EF_PARISC_TRAPNIL 0x00010000 /* Trap nil pointer dereference. */ +#define EF_PARISC_EXT 0x00020000 /* Program uses arch. extensions. */ +#define EF_PARISC_LSB 0x00040000 /* Program expects little endian. */ +#define EF_PARISC_WIDE 0x00080000 /* Program expects wide mode. */ +#define EF_PARISC_NO_KABP 0x00100000 /* No kernel assisted branch + prediction. */ +#define EF_PARISC_LAZYSWAP 0x00400000 /* Allow lazy swapping. */ +#define EF_PARISC_ARCH 0x0000ffff /* Architecture version. */ + +/* Defined values for `e_flags & EF_PARISC_ARCH' are: */ + +#define EFA_PARISC_1_0 0x020b /* PA-RISC 1.0 big-endian. */ +#define EFA_PARISC_1_1 0x0210 /* PA-RISC 1.1 big-endian. */ +#define EFA_PARISC_2_0 0x0214 /* PA-RISC 2.0 big-endian. */ + +/* Additional section indeces. */ + +#define SHN_PARISC_ANSI_COMMON 0xff00 /* Section for tenatively declared + symbols in ANSI C. */ +#define SHN_PARISC_HUGE_COMMON 0xff01 /* Common blocks in huge model. */ + +/* Legal values for sh_type field of Elf32_Shdr. */ + +#define SHT_PARISC_EXT 0x70000000 /* Contains product specific ext. */ +#define SHT_PARISC_UNWIND 0x70000001 /* Unwind information. */ +#define SHT_PARISC_DOC 0x70000002 /* Debug info for optimized code. */ + +/* Legal values for sh_flags field of Elf32_Shdr. */ + +#define SHF_PARISC_SHORT 0x20000000 /* Section with short addressing. */ +#define SHF_PARISC_HUGE 0x40000000 /* Section far from gp. */ +#define SHF_PARISC_SBP 0x80000000 /* Static branch prediction code. */ + +/* Legal values for ST_TYPE subfield of st_info (symbol type). */ + +#define STT_PARISC_MILLICODE 13 /* Millicode function entry point. */ + +#define STT_HP_OPAQUE (STT_LOOS + 0x1) +#define STT_HP_STUB (STT_LOOS + 0x2) + +/* HPPA relocs. */ + +#define R_PARISC_NONE 0 /* No reloc. */ +#define R_PARISC_DIR32 1 /* Direct 32-bit reference. */ +#define R_PARISC_DIR21L 2 /* Left 21 bits of eff. address. */ +#define R_PARISC_DIR17R 3 /* Right 17 bits of eff. address. */ +#define R_PARISC_DIR17F 4 /* 17 bits of eff. address. */ +#define R_PARISC_DIR14R 6 /* Right 14 bits of eff. 
address. */ +#define R_PARISC_PCREL32 9 /* 32-bit rel. address. */ +#define R_PARISC_PCREL21L 10 /* Left 21 bits of rel. address. */ +#define R_PARISC_PCREL17R 11 /* Right 17 bits of rel. address. */ +#define R_PARISC_PCREL17F 12 /* 17 bits of rel. address. */ +#define R_PARISC_PCREL14R 14 /* Right 14 bits of rel. address. */ +#define R_PARISC_DPREL21L 18 /* Left 21 bits of rel. address. */ +#define R_PARISC_DPREL14R 22 /* Right 14 bits of rel. address. */ +#define R_PARISC_GPREL21L 26 /* GP-relative, left 21 bits. */ +#define R_PARISC_GPREL14R 30 /* GP-relative, right 14 bits. */ +#define R_PARISC_LTOFF21L 34 /* LT-relative, left 21 bits. */ +#define R_PARISC_LTOFF14R 38 /* LT-relative, right 14 bits. */ +#define R_PARISC_SECREL32 41 /* 32 bits section rel. address. */ +#define R_PARISC_SEGBASE 48 /* No relocation, set segment base. */ +#define R_PARISC_SEGREL32 49 /* 32 bits segment rel. address. */ +#define R_PARISC_PLTOFF21L 50 /* PLT rel. address, left 21 bits. */ +#define R_PARISC_PLTOFF14R 54 /* PLT rel. address, right 14 bits. */ +#define R_PARISC_LTOFF_FPTR32 57 /* 32 bits LT-rel. function pointer. */ +#define R_PARISC_LTOFF_FPTR21L 58 /* LT-rel. fct ptr, left 21 bits. */ +#define R_PARISC_LTOFF_FPTR14R 62 /* LT-rel. fct ptr, right 14 bits. */ +#define R_PARISC_FPTR64 64 /* 64 bits function address. */ +#define R_PARISC_PLABEL32 65 /* 32 bits function address. */ +#define R_PARISC_PCREL64 72 /* 64 bits PC-rel. address. */ +#define R_PARISC_PCREL22F 74 /* 22 bits PC-rel. address. */ +#define R_PARISC_PCREL14WR 75 /* PC-rel. address, right 14 bits. */ +#define R_PARISC_PCREL14DR 76 /* PC rel. address, right 14 bits. */ +#define R_PARISC_PCREL16F 77 /* 16 bits PC-rel. address. */ +#define R_PARISC_PCREL16WF 78 /* 16 bits PC-rel. address. */ +#define R_PARISC_PCREL16DF 79 /* 16 bits PC-rel. address. */ +#define R_PARISC_DIR64 80 /* 64 bits of eff. address. */ +#define R_PARISC_DIR14WR 83 /* 14 bits of eff. address. */ +#define R_PARISC_DIR14DR 84 /* 14 bits of eff. address. */ +#define R_PARISC_DIR16F 85 /* 16 bits of eff. address. */ +#define R_PARISC_DIR16WF 86 /* 16 bits of eff. address. */ +#define R_PARISC_DIR16DF 87 /* 16 bits of eff. address. */ +#define R_PARISC_GPREL64 88 /* 64 bits of GP-rel. address. */ +#define R_PARISC_GPREL14WR 91 /* GP-rel. address, right 14 bits. */ +#define R_PARISC_GPREL14DR 92 /* GP-rel. address, right 14 bits. */ +#define R_PARISC_GPREL16F 93 /* 16 bits GP-rel. address. */ +#define R_PARISC_GPREL16WF 94 /* 16 bits GP-rel. address. */ +#define R_PARISC_GPREL16DF 95 /* 16 bits GP-rel. address. */ +#define R_PARISC_LTOFF64 96 /* 64 bits LT-rel. address. */ +#define R_PARISC_LTOFF14WR 99 /* LT-rel. address, right 14 bits. */ +#define R_PARISC_LTOFF14DR 100 /* LT-rel. address, right 14 bits. */ +#define R_PARISC_LTOFF16F 101 /* 16 bits LT-rel. address. */ +#define R_PARISC_LTOFF16WF 102 /* 16 bits LT-rel. address. */ +#define R_PARISC_LTOFF16DF 103 /* 16 bits LT-rel. address. */ +#define R_PARISC_SECREL64 104 /* 64 bits section rel. address. */ +#define R_PARISC_SEGREL64 112 /* 64 bits segment rel. address. */ +#define R_PARISC_PLTOFF14WR 115 /* PLT-rel. address, right 14 bits. */ +#define R_PARISC_PLTOFF14DR 116 /* PLT-rel. address, right 14 bits. */ +#define R_PARISC_PLTOFF16F 117 /* 16 bits LT-rel. address. */ +#define R_PARISC_PLTOFF16WF 118 /* 16 bits PLT-rel. address. */ +#define R_PARISC_PLTOFF16DF 119 /* 16 bits PLT-rel. address. */ +#define R_PARISC_LTOFF_FPTR64 120 /* 64 bits LT-rel. function ptr. 
*/ +#define R_PARISC_LTOFF_FPTR14WR 123 /* LT-rel. fct. ptr., right 14 bits. */ +#define R_PARISC_LTOFF_FPTR14DR 124 /* LT-rel. fct. ptr., right 14 bits. */ +#define R_PARISC_LTOFF_FPTR16F 125 /* 16 bits LT-rel. function ptr. */ +#define R_PARISC_LTOFF_FPTR16WF 126 /* 16 bits LT-rel. function ptr. */ +#define R_PARISC_LTOFF_FPTR16DF 127 /* 16 bits LT-rel. function ptr. */ +#define R_PARISC_LORESERVE 128 +#define R_PARISC_COPY 128 /* Copy relocation. */ +#define R_PARISC_IPLT 129 /* Dynamic reloc, imported PLT */ +#define R_PARISC_EPLT 130 /* Dynamic reloc, exported PLT */ +#define R_PARISC_TPREL32 153 /* 32 bits TP-rel. address. */ +#define R_PARISC_TPREL21L 154 /* TP-rel. address, left 21 bits. */ +#define R_PARISC_TPREL14R 158 /* TP-rel. address, right 14 bits. */ +#define R_PARISC_LTOFF_TP21L 162 /* LT-TP-rel. address, left 21 bits. */ +#define R_PARISC_LTOFF_TP14R 166 /* LT-TP-rel. address, right 14 bits.*/ +#define R_PARISC_LTOFF_TP14F 167 /* 14 bits LT-TP-rel. address. */ +#define R_PARISC_TPREL64 216 /* 64 bits TP-rel. address. */ +#define R_PARISC_TPREL14WR 219 /* TP-rel. address, right 14 bits. */ +#define R_PARISC_TPREL14DR 220 /* TP-rel. address, right 14 bits. */ +#define R_PARISC_TPREL16F 221 /* 16 bits TP-rel. address. */ +#define R_PARISC_TPREL16WF 222 /* 16 bits TP-rel. address. */ +#define R_PARISC_TPREL16DF 223 /* 16 bits TP-rel. address. */ +#define R_PARISC_LTOFF_TP64 224 /* 64 bits LT-TP-rel. address. */ +#define R_PARISC_LTOFF_TP14WR 227 /* LT-TP-rel. address, right 14 bits.*/ +#define R_PARISC_LTOFF_TP14DR 228 /* LT-TP-rel. address, right 14 bits.*/ +#define R_PARISC_LTOFF_TP16F 229 /* 16 bits LT-TP-rel. address. */ +#define R_PARISC_LTOFF_TP16WF 230 /* 16 bits LT-TP-rel. address. */ +#define R_PARISC_LTOFF_TP16DF 231 /* 16 bits LT-TP-rel. address. */ +#define R_PARISC_HIRESERVE 255 + +/* Legal values for p_type field of Elf32_Phdr/Elf64_Phdr. */ + +#define PT_HP_TLS (PT_LOOS + 0x0) +#define PT_HP_CORE_NONE (PT_LOOS + 0x1) +#define PT_HP_CORE_VERSION (PT_LOOS + 0x2) +#define PT_HP_CORE_KERNEL (PT_LOOS + 0x3) +#define PT_HP_CORE_COMM (PT_LOOS + 0x4) +#define PT_HP_CORE_PROC (PT_LOOS + 0x5) +#define PT_HP_CORE_LOADABLE (PT_LOOS + 0x6) +#define PT_HP_CORE_STACK (PT_LOOS + 0x7) +#define PT_HP_CORE_SHM (PT_LOOS + 0x8) +#define PT_HP_CORE_MMF (PT_LOOS + 0x9) +#define PT_HP_PARALLEL (PT_LOOS + 0x10) +#define PT_HP_FASTBIND (PT_LOOS + 0x11) +#define PT_HP_OPT_ANNOT (PT_LOOS + 0x12) +#define PT_HP_HSL_ANNOT (PT_LOOS + 0x13) +#define PT_HP_STACK (PT_LOOS + 0x14) + +#define PT_PARISC_ARCHEXT 0x70000000 +#define PT_PARISC_UNWIND 0x70000001 + +/* Legal values for p_flags field of Elf32_Phdr/Elf64_Phdr. */ + +#define PF_PARISC_SBP 0x08000000 + +#define PF_HP_PAGE_SIZE 0x00100000 +#define PF_HP_FAR_SHARED 0x00200000 +#define PF_HP_NEAR_SHARED 0x00400000 +#define PF_HP_CODE 0x01000000 +#define PF_HP_MODIFY 0x02000000 +#define PF_HP_LAZYSWAP 0x04000000 +#define PF_HP_SBP 0x08000000 + +/* IA-64 specific declarations. */ + +/* Processor specific flags for the Ehdr e_flags field. */ +#define EF_IA_64_MASKOS 0x0000000f /* os-specific flags */ +#define EF_IA_64_ABI64 0x00000010 /* 64-bit ABI */ +#define EF_IA_64_ARCH 0xff000000 /* arch. version mask */ + +/* Processor specific values for the Phdr p_type field. */ +#define PT_IA_64_ARCHEXT (PT_LOPROC + 0) /* arch extension bits */ +#define PT_IA_64_UNWIND (PT_LOPROC + 1) /* ia64 unwind bits */ + +/* Processor specific flags for the Phdr p_flags field. 
*/ +#define PF_IA_64_NORECOV 0x80000000 /* spec insns w/o recovery */ + +/* Processor specific values for the Shdr sh_type field. */ +#define SHT_IA_64_EXT (SHT_LOPROC + 0) /* extension bits */ +#define SHT_IA_64_UNWIND (SHT_LOPROC + 1) /* unwind bits */ + +/* Processor specific flags for the Shdr sh_flags field. */ +#define SHF_IA_64_SHORT 0x10000000 /* section near gp */ +#define SHF_IA_64_NORECOV 0x20000000 /* spec insns w/o recovery */ + +/* Processor specific values for the Dyn d_tag field. */ +#define DT_IA_64_PLT_RESERVE (DT_LOPROC + 0) +#define DT_IA_64_NUM 1 + +/* IA-64 relocations. */ +#define R_IA64_NONE 0x00 /* none */ +#define R_IA64_IMM14 0x21 /* symbol + addend, add imm14 */ +#define R_IA64_IMM22 0x22 /* symbol + addend, add imm22 */ +#define R_IA64_IMM64 0x23 /* symbol + addend, mov imm64 */ +#define R_IA64_DIR32MSB 0x24 /* symbol + addend, data4 MSB */ +#define R_IA64_DIR32LSB 0x25 /* symbol + addend, data4 LSB */ +#define R_IA64_DIR64MSB 0x26 /* symbol + addend, data8 MSB */ +#define R_IA64_DIR64LSB 0x27 /* symbol + addend, data8 LSB */ +#define R_IA64_GPREL22 0x2a /* @gprel(sym + add), add imm22 */ +#define R_IA64_GPREL64I 0x2b /* @gprel(sym + add), mov imm64 */ +#define R_IA64_GPREL32MSB 0x2c /* @gprel(sym + add), data4 MSB */ +#define R_IA64_GPREL32LSB 0x2d /* @gprel(sym + add), data4 LSB */ +#define R_IA64_GPREL64MSB 0x2e /* @gprel(sym + add), data8 MSB */ +#define R_IA64_GPREL64LSB 0x2f /* @gprel(sym + add), data8 LSB */ +#define R_IA64_LTOFF22 0x32 /* @ltoff(sym + add), add imm22 */ +#define R_IA64_LTOFF64I 0x33 /* @ltoff(sym + add), mov imm64 */ +#define R_IA64_PLTOFF22 0x3a /* @pltoff(sym + add), add imm22 */ +#define R_IA64_PLTOFF64I 0x3b /* @pltoff(sym + add), mov imm64 */ +#define R_IA64_PLTOFF64MSB 0x3e /* @pltoff(sym + add), data8 MSB */ +#define R_IA64_PLTOFF64LSB 0x3f /* @pltoff(sym + add), data8 LSB */ +#define R_IA64_FPTR64I 0x43 /* @fptr(sym + add), mov imm64 */ +#define R_IA64_FPTR32MSB 0x44 /* @fptr(sym + add), data4 MSB */ +#define R_IA64_FPTR32LSB 0x45 /* @fptr(sym + add), data4 LSB */ +#define R_IA64_FPTR64MSB 0x46 /* @fptr(sym + add), data8 MSB */ +#define R_IA64_FPTR64LSB 0x47 /* @fptr(sym + add), data8 LSB */ +#define R_IA64_PCREL60B 0x48 /* @pcrel(sym + add), brl */ +#define R_IA64_PCREL21B 0x49 /* @pcrel(sym + add), ptb, call */ +#define R_IA64_PCREL21M 0x4a /* @pcrel(sym + add), chk.s */ +#define R_IA64_PCREL21F 0x4b /* @pcrel(sym + add), fchkf */ +#define R_IA64_PCREL32MSB 0x4c /* @pcrel(sym + add), data4 MSB */ +#define R_IA64_PCREL32LSB 0x4d /* @pcrel(sym + add), data4 LSB */ +#define R_IA64_PCREL64MSB 0x4e /* @pcrel(sym + add), data8 MSB */ +#define R_IA64_PCREL64LSB 0x4f /* @pcrel(sym + add), data8 LSB */ +#define R_IA64_LTOFF_FPTR22 0x52 /* @ltoff(@fptr(s+a)), imm22 */ +#define R_IA64_LTOFF_FPTR64I 0x53 /* @ltoff(@fptr(s+a)), imm64 */ +#define R_IA64_LTOFF_FPTR32MSB 0x54 /* @ltoff(@fptr(s+a)), data4 MSB */ +#define R_IA64_LTOFF_FPTR32LSB 0x55 /* @ltoff(@fptr(s+a)), data4 LSB */ +#define R_IA64_LTOFF_FPTR64MSB 0x56 /* @ltoff(@fptr(s+a)), data8 MSB */ +#define R_IA64_LTOFF_FPTR64LSB 0x57 /* @ltoff(@fptr(s+a)), data8 LSB */ +#define R_IA64_SEGREL32MSB 0x5c /* @segrel(sym + add), data4 MSB */ +#define R_IA64_SEGREL32LSB 0x5d /* @segrel(sym + add), data4 LSB */ +#define R_IA64_SEGREL64MSB 0x5e /* @segrel(sym + add), data8 MSB */ +#define R_IA64_SEGREL64LSB 0x5f /* @segrel(sym + add), data8 LSB */ +#define R_IA64_SECREL32MSB 0x64 /* @secrel(sym + add), data4 MSB */ +#define R_IA64_SECREL32LSB 0x65 /* @secrel(sym + add), data4 LSB */ +#define 
R_IA64_SECREL64MSB 0x66 /* @secrel(sym + add), data8 MSB */ +#define R_IA64_SECREL64LSB 0x67 /* @secrel(sym + add), data8 LSB */ +#define R_IA64_REL32MSB 0x6c /* data 4 + REL */ +#define R_IA64_REL32LSB 0x6d /* data 4 + REL */ +#define R_IA64_REL64MSB 0x6e /* data 8 + REL */ +#define R_IA64_REL64LSB 0x6f /* data 8 + REL */ +#define R_IA64_LTV32MSB 0x74 /* symbol + addend, data4 MSB */ +#define R_IA64_LTV32LSB 0x75 /* symbol + addend, data4 LSB */ +#define R_IA64_LTV64MSB 0x76 /* symbol + addend, data8 MSB */ +#define R_IA64_LTV64LSB 0x77 /* symbol + addend, data8 LSB */ +#define R_IA64_PCREL21BI 0x79 /* @pcrel(sym + add), 21bit inst */ +#define R_IA64_PCREL22 0x7a /* @pcrel(sym + add), 22bit inst */ +#define R_IA64_PCREL64I 0x7b /* @pcrel(sym + add), 64bit inst */ +#define R_IA64_IPLTMSB 0x80 /* dynamic reloc, imported PLT, MSB */ +#define R_IA64_IPLTLSB 0x81 /* dynamic reloc, imported PLT, LSB */ +#define R_IA64_COPY 0x84 /* copy relocation */ +#define R_IA64_SUB 0x85 /* Addend and symbol difference */ +#define R_IA64_LTOFF22X 0x86 /* LTOFF22, relaxable. */ +#define R_IA64_LDXMOV 0x87 /* Use of LTOFF22X. */ +#define R_IA64_TPREL14 0x91 /* @tprel(sym + add), imm14 */ +#define R_IA64_TPREL22 0x92 /* @tprel(sym + add), imm22 */ +#define R_IA64_TPREL64I 0x93 /* @tprel(sym + add), imm64 */ +#define R_IA64_TPREL64MSB 0x96 /* @tprel(sym + add), data8 MSB */ +#define R_IA64_TPREL64LSB 0x97 /* @tprel(sym + add), data8 LSB */ +#define R_IA64_LTOFF_TPREL22 0x9a /* @ltoff(@tprel(s+a)), imm2 */ +#define R_IA64_DTPMOD64MSB 0xa6 /* @dtpmod(sym + add), data8 MSB */ +#define R_IA64_DTPMOD64LSB 0xa7 /* @dtpmod(sym + add), data8 LSB */ +#define R_IA64_LTOFF_DTPMOD22 0xaa /* @ltoff(@dtpmod(sym + add)), imm22 */ +#define R_IA64_DTPREL14 0xb1 /* @dtprel(sym + add), imm14 */ +#define R_IA64_DTPREL22 0xb2 /* @dtprel(sym + add), imm22 */ +#define R_IA64_DTPREL64I 0xb3 /* @dtprel(sym + add), imm64 */ +#define R_IA64_DTPREL32MSB 0xb4 /* @dtprel(sym + add), data4 MSB */ +#define R_IA64_DTPREL32LSB 0xb5 /* @dtprel(sym + add), data4 LSB */ +#define R_IA64_DTPREL64MSB 0xb6 /* @dtprel(sym + add), data8 MSB */ +#define R_IA64_DTPREL64LSB 0xb7 /* @dtprel(sym + add), data8 LSB */ +#define R_IA64_LTOFF_DTPREL22 0xba /* @ltoff(@dtprel(s+a)), imm22 */ + +typedef struct elf32_rel { + Elf32_Addr r_offset; + Elf32_Word r_info; +} Elf32_Rel; + +typedef struct elf64_rel { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ +} Elf64_Rel; + +typedef struct elf32_rela{ + Elf32_Addr r_offset; + Elf32_Word r_info; + Elf32_Sword r_addend; +} Elf32_Rela; + +typedef struct elf64_rela { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ + Elf64_Sxword r_addend; /* Constant addend used to compute value */ +} Elf64_Rela; + +typedef struct elf32_sym{ + Elf32_Word st_name; + Elf32_Addr st_value; + Elf32_Word st_size; + unsigned char st_info; + unsigned char st_other; + Elf32_Half st_shndx; +} Elf32_Sym; + +typedef struct elf64_sym { + Elf64_Word st_name; /* Symbol name, index in string tbl */ + unsigned char st_info; /* Type and binding attributes */ + unsigned char st_other; /* No defined meaning, 0 */ + Elf64_Half st_shndx; /* Associated section index */ + Elf64_Addr st_value; /* Value of the symbol */ + Elf64_Xword st_size; /* Associated symbol size */ +} Elf64_Sym; + + +#define EI_NIDENT 16 + +typedef struct elf32_hdr{ + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half 
e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; /* Entry point */ + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[16]; /* ELF "magic number" */ + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* These constants define the permissions on sections in the program + header, p_flags. */ +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +typedef struct elf32_phdr{ + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment, file & memory */ +} Elf64_Phdr; + +/* sh_type */ +#define SHT_NULL 0 +#define SHT_PROGBITS 1 +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_RELA 4 +#define SHT_HASH 5 +#define SHT_DYNAMIC 6 +#define SHT_NOTE 7 +#define SHT_NOBITS 8 +#define SHT_REL 9 +#define SHT_SHLIB 10 +#define SHT_DYNSYM 11 +#define SHT_NUM 12 +#define SHT_LOPROC 0x70000000 +#define SHT_HIPROC 0x7fffffff +#define SHT_LOUSER 0x80000000 +#define SHT_HIUSER 0xffffffff +#define SHT_MIPS_LIST 0x70000000 +#define SHT_MIPS_CONFLICT 0x70000002 +#define SHT_MIPS_GPTAB 0x70000003 +#define SHT_MIPS_UCODE 0x70000004 + +/* sh_flags */ +#define SHF_WRITE 0x1 +#define SHF_ALLOC 0x2 +#define SHF_EXECINSTR 0x4 +#define SHF_MASKPROC 0xf0000000 +#define SHF_MIPS_GPREL 0x10000000 + +/* special section indexes */ +#define SHN_UNDEF 0 +#define SHN_LORESERVE 0xff00 +#define SHN_LOPROC 0xff00 +#define SHN_HIPROC 0xff1f +#define SHN_ABS 0xfff1 +#define SHN_COMMON 0xfff2 +#define SHN_HIRESERVE 0xffff +#define SHN_MIPS_ACCOMON 0xff00 + +typedef struct elf32_shdr { + Elf32_Word sh_name; + Elf32_Word sh_type; + Elf32_Word sh_flags; + Elf32_Addr sh_addr; + Elf32_Off sh_offset; + Elf32_Word sh_size; + Elf32_Word sh_link; + Elf32_Word sh_info; + Elf32_Word sh_addralign; + Elf32_Word sh_entsize; +} Elf32_Shdr; + +typedef struct elf64_shdr { + Elf64_Word sh_name; /* Section name, index in string tbl */ + Elf64_Word sh_type; /* Type of section */ + Elf64_Xword sh_flags; /* Miscellaneous section attributes */ + Elf64_Addr sh_addr; /* Section virtual addr at execution */ + Elf64_Off sh_offset; /* Section file offset */ + Elf64_Xword sh_size; /* Size of section in bytes */ + Elf64_Word sh_link; /* Index of another section */ + Elf64_Word sh_info; /* Additional section information */ + Elf64_Xword sh_addralign; /* Section alignment */ + Elf64_Xword sh_entsize; /* Entry size if section holds table */ +} Elf64_Shdr; + +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define 
EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFOSABI_NONE 0 /* UNIX System V ABI */ +#define ELFOSABI_SYSV 0 /* Alias. */ +#define ELFOSABI_HPUX 1 /* HP-UX */ +#define ELFOSABI_NETBSD 2 /* NetBSD. */ +#define ELFOSABI_LINUX 3 /* Linux. */ +#define ELFOSABI_SOLARIS 6 /* Sun Solaris. */ +#define ELFOSABI_AIX 7 /* IBM AIX. */ +#define ELFOSABI_IRIX 8 /* SGI Irix. */ +#define ELFOSABI_FREEBSD 9 /* FreeBSD. */ +#define ELFOSABI_TRU64 10 /* Compaq TRU64 UNIX. */ +#define ELFOSABI_MODESTO 11 /* Novell Modesto. */ +#define ELFOSABI_OPENBSD 12 /* OpenBSD. */ +#define ELFOSABI_ARM 97 /* ARM */ +#define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */ + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +/* Notes used in ET_CORE */ +#define NT_PRSTATUS 1 +#define NT_PRFPREG 2 +#define NT_PRPSINFO 3 +#define NT_TASKSTRUCT 4 +#define NT_AUXV 6 +#define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ + + +/* Note header in a PT_NOTE section */ +typedef struct elf32_note { + Elf32_Word n_namesz; /* Name size */ + Elf32_Word n_descsz; /* Content size */ + Elf32_Word n_type; /* Content type */ +} Elf32_Nhdr; + +/* Note header in a PT_NOTE section */ +typedef struct elf64_note { + Elf64_Word n_namesz; /* Name size */ + Elf64_Word n_descsz; /* Content size */ + Elf64_Word n_type; /* Content type */ +} Elf64_Nhdr; + +#ifdef ELF_CLASS +#if ELF_CLASS == ELFCLASS32 + +#define elfhdr elf32_hdr +#define elf_phdr elf32_phdr +#define elf_note elf32_note +#define elf_shdr elf32_shdr +#define elf_sym elf32_sym +#define elf_addr_t Elf32_Off + +#ifdef ELF_USES_RELOCA +# define ELF_RELOC Elf32_Rela +#else +# define ELF_RELOC Elf32_Rel +#endif + +#else + +#define elfhdr elf64_hdr +#define elf_phdr elf64_phdr +#define elf_note elf64_note +#define elf_shdr elf64_shdr +#define elf_sym elf64_sym +#define elf_addr_t Elf64_Off + +#ifdef ELF_USES_RELOCA +# define ELF_RELOC Elf64_Rela +#else +# define ELF_RELOC Elf64_Rel +#endif + +#endif /* ELF_CLASS */ + +#ifndef ElfW +# if ELF_CLASS == ELFCLASS32 +# define ElfW(x) Elf32_ ## x +# define ELFW(x) ELF32_ ## x +# else +# define ElfW(x) Elf64_ ## x +# define ELFW(x) ELF64_ ## x +# endif +#endif + +#endif /* ELF_CLASS */ + + +#endif /* _QEMU_ELF_H */ diff --git a/src/recompiler/exec-all.h b/src/recompiler/exec-all.h new file mode 100644 index 00000000..52145e3e --- /dev/null +++ b/src/recompiler/exec-all.h @@ -0,0 +1,400 @@ +/* + * internal execution defines for qemu + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
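As a rough illustration (not part of these sources) of how the e_ident indexes and magic bytes defined in elf.h above are typically combined to sanity-check an image; elf_ident_ok() is a hypothetical helper and <string.h> is assumed for memcmp():

    /* Hypothetical helper: validate the identification bytes of an ELF image
       using ELFMAG/SELFMAG and the EI_* indexes defined above. */
    static int elf_ident_ok(const unsigned char *e_ident)
    {
        if (memcmp(e_ident, ELFMAG, SELFMAG) != 0)                /* "\177ELF" */
            return 0;
        if (e_ident[EI_CLASS] != ELFCLASS32 && e_ident[EI_CLASS] != ELFCLASS64)
            return 0;
        if (e_ident[EI_DATA] != ELFDATA2LSB && e_ident[EI_DATA] != ELFDATA2MSB)
            return 0;
        return e_ident[EI_VERSION] == EV_CURRENT;
    }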
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#ifndef _EXEC_ALL_H_ +#define _EXEC_ALL_H_ + +#include "qemu-common.h" +#ifdef VBOX +# include <VBox/vmm/tm.h> +# include <VBox/vmm/pgm.h> /* PGM_DYNAMIC_RAM_ALLOC */ +# include <VBox/vmm/em.h> /* EMRemIsLockOwner */ +# ifndef LOG_GROUP +# define LOG_GROUP LOG_GROUP_REM +# endif +# include <VBox/log.h> +# include "REMInternal.h" +# include <VBox/vmm/vm.h> +#endif /* VBOX */ + +/* allow to see translation results - the slowdown should be negligible, so we leave it */ +#ifndef VBOX +#define DEBUG_DISAS +#endif /* !VBOX */ + +/* Page tracking code uses ram addresses in system mode, and virtual + addresses in userspace mode. Define tb_page_addr_t to be an appropriate + type. */ +#if defined(CONFIG_USER_ONLY) +typedef abi_ulong tb_page_addr_t; +#else +typedef ram_addr_t tb_page_addr_t; +#endif + +/* is_jmp field values */ +#define DISAS_NEXT 0 /* next instruction can be analyzed */ +#define DISAS_JUMP 1 /* only pc was modified dynamically */ +#define DISAS_UPDATE 2 /* cpu state was modified dynamically */ +#define DISAS_TB_JUMP 3 /* only pc was modified statically */ + +typedef struct TranslationBlock TranslationBlock; + +/* XXX: make safe guess about sizes */ +#define MAX_OP_PER_INSTR 266 + +#if HOST_LONG_BITS == 32 +#define MAX_OPC_PARAM_PER_ARG 2 +#else +#define MAX_OPC_PARAM_PER_ARG 1 +#endif +#define MAX_OPC_PARAM_IARGS 4 +#define MAX_OPC_PARAM_OARGS 1 +#define MAX_OPC_PARAM_ARGS (MAX_OPC_PARAM_IARGS + MAX_OPC_PARAM_OARGS) + +/* A Call op needs up to 4 + 2N parameters on 32-bit archs, + * and up to 4 + N parameters on 64-bit archs + * (N = number of input arguments + output arguments). */ +#define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS)) +#define OPC_BUF_SIZE 640 +#define OPC_MAX_SIZE (OPC_BUF_SIZE - MAX_OP_PER_INSTR) + +/* Maximum size a TCG op can expand to. This is complicated because a + single op may require several host instructions and register reloads. + For now take a wild guess at 192 bytes, which should allow at least + a couple of fixup instructions per argument. 
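Plugging in the constants just defined, as a quick sanity check (not text from the sources): MAX_OPC_PARAM_ARGS = 4 + 1 = 5, so MAX_OPC_PARAM works out to 4 + 2 * 5 = 14 parameters on a 32-bit host and 4 + 1 * 5 = 9 on a 64-bit host; OPC_MAX_SIZE = 640 - 266 = 374 ops, and OPPARAM_BUF_SIZE provides 640 * MAX_OPC_PARAM parameter slots.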
*/ +#define TCG_MAX_OP_SIZE 192 + +#define OPPARAM_BUF_SIZE (OPC_BUF_SIZE * MAX_OPC_PARAM) + +extern target_ulong gen_opc_pc[OPC_BUF_SIZE]; +extern uint8_t gen_opc_instr_start[OPC_BUF_SIZE]; +extern uint16_t gen_opc_icount[OPC_BUF_SIZE]; + +#include "qemu-log.h" + +void gen_intermediate_code(CPUState *env, struct TranslationBlock *tb); +void gen_intermediate_code_pc(CPUState *env, struct TranslationBlock *tb); +void gen_pc_load(CPUState *env, struct TranslationBlock *tb, + uintptr_t searched_pc, int pc_pos, void *puc); + +void cpu_gen_init(void); +int cpu_gen_code(CPUState *env, struct TranslationBlock *tb, + int *gen_code_size_ptr); +int cpu_restore_state(struct TranslationBlock *tb, + CPUState *env, uintptr_t searched_pc, + void *puc); +void cpu_resume_from_signal(CPUState *env1, void *puc); +void cpu_io_recompile(CPUState *env, void *retaddr); +TranslationBlock *tb_gen_code(CPUState *env, + target_ulong pc, target_ulong cs_base, int flags, + int cflags); +void cpu_exec_init(CPUState *env); +void QEMU_NORETURN cpu_loop_exit(void); +int page_unprotect(target_ulong address, uintptr_t pc, void *puc); +void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, + int is_cpu_write_access); +void tb_invalidate_page_range(target_ulong start, target_ulong end); +void tlb_flush_page(CPUState *env, target_ulong addr); +void tlb_flush(CPUState *env, int flush_global); +#if !defined(CONFIG_USER_ONLY) +void tlb_set_page(CPUState *env, target_ulong vaddr, + target_phys_addr_t paddr, int prot, + int mmu_idx, target_ulong size); +#endif + +#define CODE_GEN_ALIGN 16 /* must be >= of the size of a icache line */ + +#define CODE_GEN_PHYS_HASH_BITS 15 +#define CODE_GEN_PHYS_HASH_SIZE (1 << CODE_GEN_PHYS_HASH_BITS) + +#define MIN_CODE_GEN_BUFFER_SIZE (1024 * 1024) + +/* estimated block size for TB allocation */ +/* XXX: use a per code average code fragment size and modulate it + according to the host CPU */ +#if defined(CONFIG_SOFTMMU) +#define CODE_GEN_AVG_BLOCK_SIZE 128 +#else +#define CODE_GEN_AVG_BLOCK_SIZE 64 +#endif + +#if defined(_ARCH_PPC) || defined(__x86_64__) || defined(__arm__) || defined(__i386__) +#define USE_DIRECT_JUMP +#endif + +#ifdef VBOX /* bird: not safe in next step because of threading & cpu_interrupt. */ +# undef USE_DIRECT_JUMP +#endif /* VBOX */ + +struct TranslationBlock { + target_ulong pc; /* simulated PC corresponding to this block (EIP + CS base) */ + target_ulong cs_base; /* CS base for this block */ + uint64_t flags; /* flags defining in which context the code was generated */ + uint16_t size; /* size of target code for this block (1 <= + size <= TARGET_PAGE_SIZE) */ + uint16_t cflags; /* compile flags */ +#define CF_COUNT_MASK 0x7fff +#define CF_LAST_IO 0x8000 /* Last insn may be an IO access. */ +#ifdef VBOX +# define CF_RAW_MODE 0x0010 /* block was generated in raw mode */ +#endif + + uint8_t *tc_ptr; /* pointer to the translated code */ + /* next matching tb for physical address. */ + struct TranslationBlock *phys_hash_next; + /* first and second physical page containing code. The lower bit + of the pointer tells the index in page_next[] */ + struct TranslationBlock *page_next[2]; + tb_page_addr_t page_addr[2]; + + /* the following data are used to directly call another TB from + the code of this one. 
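The low tag bits mentioned in the comments above are decoded with the same two-line pattern throughout this code (see tb_page_remove() and tb_jmp_remove() in exec.c further down); a minimal sketch, where tb_untag_ptr() is a hypothetical helper, not a function from these sources:

    /* Strip the tag bits from a page_next[]/jmp list pointer. */
    static inline TranslationBlock *tb_untag_ptr(TranslationBlock *tagged, int *pn)
    {
        *pn = (int)((intptr_t)tagged & 3);   /* 0/1 = slot index; 2 = end-of-list marker in the jmp chain */
        return (TranslationBlock *)((intptr_t)tagged & ~3);
    }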
*/ + uint16_t tb_next_offset[2]; /* offset of original jump target */ +#ifdef USE_DIRECT_JUMP + uint16_t tb_jmp_offset[2]; /* offset of jump instruction */ +#else + uintptr_t tb_next[2]; /* address of jump generated code */ +#endif + /* list of TBs jumping to this one. This is a circular list using + the two least significant bits of the pointers to tell what is + the next pointer: 0 = jmp_next[0], 1 = jmp_next[1], 2 = + jmp_first */ + struct TranslationBlock *jmp_next[2]; + struct TranslationBlock *jmp_first; + uint32_t icount; +}; + +static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK; +} + +static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK) + | (tmp & TB_JMP_ADDR_MASK)); +} + +static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc) +{ + return pc & (CODE_GEN_PHYS_HASH_SIZE - 1); +} + +TranslationBlock *tb_alloc(target_ulong pc); +void tb_free(TranslationBlock *tb); +void tb_flush(CPUState *env); +void tb_link_page(TranslationBlock *tb, + tb_page_addr_t phys_pc, tb_page_addr_t phys_page2); +void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr); + +extern TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE]; + +#if defined(USE_DIRECT_JUMP) + +#if defined(_ARCH_PPC) +extern void ppc_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr); +#define tb_set_jmp_target1 ppc_tb_set_jmp_target +#elif defined(__i386__) || defined(__x86_64__) +static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) +{ + /* patch the branch destination */ + *(uint32_t *)jmp_addr = addr - (jmp_addr + 4); + /* no need to flush icache explicitly */ +} +#elif defined(__arm__) +static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) +{ +#if QEMU_GNUC_PREREQ(4, 1) + void __clear_cache(char *beg, char *end); +#else + register unsigned long _beg __asm ("a1"); + register unsigned long _end __asm ("a2"); + register unsigned long _flg __asm ("a3"); +#endif + + /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */ + *(uint32_t *)jmp_addr = + (*(uint32_t *)jmp_addr & ~0xffffff) + | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff); + +#if QEMU_GNUC_PREREQ(4, 1) + __clear_cache((char *) jmp_addr, (char *) jmp_addr + 4); +#else + /* flush icache */ + _beg = jmp_addr; + _end = jmp_addr + 4; + _flg = 0; + __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg)); +#endif +} +#endif + +static inline void tb_set_jmp_target(TranslationBlock *tb, + int n, uintptr_t addr) +{ + uintptr_t offset; + + offset = tb->tb_jmp_offset[n]; + tb_set_jmp_target1((uintptr_t)(tb->tc_ptr + offset), addr); +} + +#else + +/* set the jump target */ +static inline void tb_set_jmp_target(TranslationBlock *tb, + int n, uintptr_t addr) +{ + tb->tb_next[n] = addr; +} + +#endif + +static inline void tb_add_jump(TranslationBlock *tb, int n, + TranslationBlock *tb_next) +{ + /* NOTE: this test is only needed for thread safety */ + if (!tb->jmp_next[n]) { + /* patch the native jump address */ + tb_set_jmp_target(tb, n, (uintptr_t)tb_next->tc_ptr); + + /* add in TB jmp circular list */ + tb->jmp_next[n] = tb_next->jmp_first; + tb_next->jmp_first = (TranslationBlock *)((intptr_t)(tb) | (n)); + } +} + +TranslationBlock 
*tb_find_pc(uintptr_t pc_ptr); + +#include "qemu-lock.h" + +extern spinlock_t tb_lock; + +extern int tb_invalidated_flag; + +#if !defined(CONFIG_USER_ONLY) + +extern CPUWriteMemoryFunc *io_mem_write[IO_MEM_NB_ENTRIES][4]; +extern CPUReadMemoryFunc *io_mem_read[IO_MEM_NB_ENTRIES][4]; +extern void *io_mem_opaque[IO_MEM_NB_ENTRIES]; + +void tlb_fill(target_ulong addr, int is_write, int mmu_idx, + void *retaddr); + +#include "softmmu_defs.h" + +#define ACCESS_TYPE (NB_MMU_MODES + 1) +#define MEMSUFFIX _code +#define env cpu_single_env + +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" + +#undef ACCESS_TYPE +#undef MEMSUFFIX +#undef env + +#endif + +#if defined(CONFIG_USER_ONLY) +static inline tb_page_addr_t get_page_addr_code(CPUState *env1, target_ulong addr) +{ + return addr; +} +#else +# ifdef VBOX +target_ulong remR3PhysGetPhysicalAddressCode(CPUState *env, target_ulong addr, CPUTLBEntry *pTLBEntry, target_phys_addr_t ioTLBEntry); +# endif +/* NOTE: this function can trigger an exception */ +/* NOTE2: the returned address is not exactly the physical address: it + is the offset relative to phys_ram_base */ +static inline tb_page_addr_t get_page_addr_code(CPUState *env1, target_ulong addr) +{ + int mmu_idx, page_index, pd; +# ifndef VBOX + void *p; +# endif + + page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + mmu_idx = cpu_mmu_index(env1); + if (unlikely(env1->tlb_table[mmu_idx][page_index].addr_code != + (addr & TARGET_PAGE_MASK))) { + ldub_code(addr); + } + pd = env1->tlb_table[mmu_idx][page_index].addr_code & ~TARGET_PAGE_MASK; + if (pd > IO_MEM_ROM && !(pd & IO_MEM_ROMD)) { +# ifdef VBOX + /* deal with non-MMIO access handlers. */ + return remR3PhysGetPhysicalAddressCode(env1, addr, + &env1->tlb_table[mmu_idx][page_index], + env1->iotlb[mmu_idx][page_index]); +# elif defined(TARGET_SPARC) || defined(TARGET_MIPS) + do_unassigned_access(addr, 0, 1, 0, 4); +#else + cpu_abort(env1, "Trying to execute code outside RAM or ROM at 0x" TARGET_FMT_lx "\n", addr); +#endif + } +# if defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) + return addr + env1->tlb_table[mmu_idx][page_index].addend; +# elif defined(VBOX) + Assert(env1->phys_addends[mmu_idx][page_index] != -1); + return addr + env1->phys_addends[mmu_idx][page_index]; +# else + p = (void *)(uintptr_t)addr + + env1->tlb_table[mmu_idx][page_index].addend; + return qemu_ram_addr_from_host(p); +# endif +} +#endif + +typedef void (CPUDebugExcpHandler)(CPUState *env); + +CPUDebugExcpHandler *cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler); + +#ifndef VBOX +/* vl.c */ +extern int singlestep; + +/* cpu-exec.c */ +extern volatile sig_atomic_t exit_request; +#endif /*!VBOX*/ + + +#endif diff --git a/src/recompiler/exec.c b/src/recompiler/exec.c new file mode 100644 index 00000000..ca632372 --- /dev/null +++ b/src/recompiler/exec.c @@ -0,0 +1,4586 @@ +/* + * virtual page mapping and translated block handling + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#include "config.h" +#ifndef VBOX +#ifdef _WIN32 +#include <windows.h> +#else +#include <sys/types.h> +#include <sys/mman.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <inttypes.h> +#else /* VBOX */ +# include <stdlib.h> +# include <stdio.h> +# include <iprt/alloc.h> +# include <iprt/string.h> +# include <iprt/param.h> +# include <VBox/vmm/pgm.h> /* PGM_DYNAMIC_RAM_ALLOC */ +# include <iprt/errcore.h> +#endif /* VBOX */ + +#include "cpu.h" +#include "exec-all.h" +#include "qemu-common.h" +#include "tcg.h" +#ifndef VBOX +#include "hw/hw.h" +#include "hw/qdev.h" +#endif /* !VBOX */ +#include "osdep.h" +#include "kvm.h" +#include "qemu-timer.h" +#if defined(CONFIG_USER_ONLY) +#include <qemu.h> +#include <signal.h> +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +#include <sys/param.h> +#if __FreeBSD_version >= 700104 +#define HAVE_KINFO_GETVMMAP +#define sigqueue sigqueue_freebsd /* avoid redefinition */ +#include <sys/time.h> +#include <sys/proc.h> +#include <machine/profile.h> +#define _KERNEL +#include <sys/user.h> +#undef _KERNEL +#undef sigqueue +#include <libutil.h> +#endif +#endif +#endif + +//#define DEBUG_TB_INVALIDATE +//#define DEBUG_FLUSH +//#define DEBUG_TLB +//#define DEBUG_UNASSIGNED + +/* make various TB consistency checks */ +//#define DEBUG_TB_CHECK +//#define DEBUG_TLB_CHECK + +//#define DEBUG_IOPORT +//#define DEBUG_SUBPAGE + +#if !defined(CONFIG_USER_ONLY) +/* TB consistency checks only implemented for usermode emulation. */ +#undef DEBUG_TB_CHECK +#endif + +#define SMC_BITMAP_USE_THRESHOLD 10 + +static TranslationBlock *tbs; +static int code_gen_max_blocks; +TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE]; +static int nb_tbs; +/* any access to the tbs or the page table must use this lock */ +spinlock_t tb_lock = SPIN_LOCK_UNLOCKED; + +#ifndef VBOX +#if defined(__arm__) || defined(__sparc_v9__) +/* The prologue must be reachable with a direct jump. ARM and Sparc64 + have limited branch ranges (possibly also PPC) so place it in a + section close to code segment. */ +#define code_gen_section \ + __attribute__((__section__(".gen_code"))) \ + __attribute__((aligned (32))) +#elif defined(_WIN32) +/* Maximum alignment for Win32 is 16. 
*/ +#define code_gen_section \ + __attribute__((aligned (16))) +#else +#define code_gen_section \ + __attribute__((aligned (32))) +#endif + +uint8_t code_gen_prologue[1024] code_gen_section; +#else /* VBOX */ +extern uint8_t *code_gen_prologue; +#endif /* VBOX */ +static uint8_t *code_gen_buffer; +static size_t code_gen_buffer_size; +/* threshold to flush the translated code buffer */ +static size_t code_gen_buffer_max_size; +static uint8_t *code_gen_ptr; + +#if !defined(CONFIG_USER_ONLY) +# ifndef VBOX +int phys_ram_fd; +static int in_migration; +# endif /* !VBOX */ + +RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list) }; +#endif + +CPUState *first_cpu; +/* current CPU in the current thread. It is only valid inside + cpu_exec() */ +CPUState *cpu_single_env; +/* 0 = Do not count executed instructions. + 1 = Precise instruction counting. + 2 = Adaptive rate instruction counting. */ +int use_icount = 0; +/* Current instruction counter. While executing translated code this may + include some instructions that have not yet been executed. */ +int64_t qemu_icount; + +typedef struct PageDesc { + /* list of TBs intersecting this ram page */ + TranslationBlock *first_tb; + /* in order to optimize self modifying code, we count the number + of lookups we do to a given page to use a bitmap */ + unsigned int code_write_count; + uint8_t *code_bitmap; +#if defined(CONFIG_USER_ONLY) + unsigned long flags; +#endif +} PageDesc; + +/* In system mode we want L1_MAP to be based on ram offsets, + while in user mode we want it to be based on virtual addresses. */ +#if !defined(CONFIG_USER_ONLY) +#if HOST_LONG_BITS < TARGET_PHYS_ADDR_SPACE_BITS +# define L1_MAP_ADDR_SPACE_BITS HOST_LONG_BITS +#else +# define L1_MAP_ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS +#endif +#else +# define L1_MAP_ADDR_SPACE_BITS TARGET_VIRT_ADDR_SPACE_BITS +#endif + +/* Size of the L2 (and L3, etc) page tables. */ +#define L2_BITS 10 +#define L2_SIZE (1 << L2_BITS) + +/* The bits remaining after N lower levels of page tables. */ +#define P_L1_BITS_REM \ + ((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS) +#define V_L1_BITS_REM \ + ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS) + +/* Size of the L1 page table. Avoid silly small sizes. */ +#if P_L1_BITS_REM < 4 +#define P_L1_BITS (P_L1_BITS_REM + L2_BITS) +#else +#define P_L1_BITS P_L1_BITS_REM +#endif + +#if V_L1_BITS_REM < 4 +#define V_L1_BITS (V_L1_BITS_REM + L2_BITS) +#else +#define V_L1_BITS V_L1_BITS_REM +#endif + +#define P_L1_SIZE ((target_phys_addr_t)1 << P_L1_BITS) +#define V_L1_SIZE ((target_ulong)1 << V_L1_BITS) + +#define P_L1_SHIFT (TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - P_L1_BITS) +#define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS) + +size_t qemu_real_host_page_size; +size_t qemu_host_page_bits; +size_t qemu_host_page_size; +uintptr_t qemu_host_page_mask; + +/* This is a multi-level map on the virtual address space. + The bottom level has pointers to PageDesc. */ +static void *l1_map[V_L1_SIZE]; + +#if !defined(CONFIG_USER_ONLY) +typedef struct PhysPageDesc { + /* offset in host memory of the page + io_index in the low bits */ + ram_addr_t phys_offset; + ram_addr_t region_offset; +} PhysPageDesc; + +/* This is a multi-level map on the physical address space. + The bottom level has pointers to PhysPageDesc. 
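Taking a 32-bit guest virtual address space with 12-bit target pages as a worked example (both values are assumptions, not fixed here): V_L1_BITS_REM = (32 - 12) % 10 = 0, which is below 4, so V_L1_BITS = 0 + 10 = 10, V_L1_SIZE = 1024 and V_L1_SHIFT = 32 - 12 - 10 = 10. The virtual map then has exactly two levels: the top 10 bits of the page index select one of the 1024 l1_map slots, and the low 10 bits select a PageDesc inside a 1024-entry leaf array, which is precisely the walk performed by page_find_alloc() further down.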
*/ +static void *l1_phys_map[P_L1_SIZE]; + +static void io_mem_init(void); + +/* io memory support */ +CPUWriteMemoryFunc *io_mem_write[IO_MEM_NB_ENTRIES][4]; +CPUReadMemoryFunc *io_mem_read[IO_MEM_NB_ENTRIES][4]; +void *io_mem_opaque[IO_MEM_NB_ENTRIES]; +static char io_mem_used[IO_MEM_NB_ENTRIES]; +static int io_mem_watch; +#endif + +#ifndef VBOX +/* log support */ +#ifdef WIN32 +static const char *logfilename = "qemu.log"; +#else +static const char *logfilename = "/tmp/qemu.log"; +#endif +#endif /* !VBOX */ +FILE *logfile; +int loglevel; +#ifndef VBOX +static int log_append = 0; +#endif /* !VBOX */ + +/* statistics */ +#ifndef VBOX +#if !defined(CONFIG_USER_ONLY) +static int tlb_flush_count; +#endif +static int tb_flush_count; +static int tb_phys_invalidate_count; +#else /* VBOX - Resettable U32 stats, see VBoxRecompiler.c. */ +uint32_t tlb_flush_count; +uint32_t tb_flush_count; +uint32_t tb_phys_invalidate_count; +#endif /* VBOX */ + +#ifndef VBOX +#ifdef _WIN32 +static void map_exec(void *addr, size_t size) +{ + DWORD old_protect; + VirtualProtect(addr, size, + PAGE_EXECUTE_READWRITE, &old_protect); + +} +#else +static void map_exec(void *addr, size_t size) +{ + uintptr_t start, end, page_size; + + page_size = getpagesize(); + start = (uintptr_t)addr; + start &= ~(page_size - 1); + + end = (uintptr_t)addr + size; + end += page_size - 1; + end &= ~(page_size - 1); + + mprotect((void *)start, end - start, + PROT_READ | PROT_WRITE | PROT_EXEC); +} +#endif +#else /* VBOX */ +static void map_exec(void *addr, size_t size) +{ + RTMemProtect(addr, size, + RTMEM_PROT_EXEC | RTMEM_PROT_READ | RTMEM_PROT_WRITE); +} +#endif /* VBOX */ + +static void page_init(void) +{ + /* NOTE: we can always suppose that qemu_host_page_size >= + TARGET_PAGE_SIZE */ +#ifdef VBOX + RTMemProtect(code_gen_buffer, code_gen_buffer_size, + RTMEM_PROT_EXEC | RTMEM_PROT_READ | RTMEM_PROT_WRITE); + qemu_real_host_page_size = PAGE_SIZE; +#else /* !VBOX */ +#ifdef _WIN32 + { + SYSTEM_INFO system_info; + + GetSystemInfo(&system_info); + qemu_real_host_page_size = system_info.dwPageSize; + } +#else + qemu_real_host_page_size = getpagesize(); +#endif +#endif /* !VBOX */ + if (qemu_host_page_size == 0) + qemu_host_page_size = qemu_real_host_page_size; + if (qemu_host_page_size < TARGET_PAGE_SIZE) + qemu_host_page_size = TARGET_PAGE_SIZE; + qemu_host_page_bits = 0; + while ((1 << qemu_host_page_bits) < VBOX_ONLY((int))qemu_host_page_size) + qemu_host_page_bits++; + qemu_host_page_mask = ~(qemu_host_page_size - 1); + +#ifndef VBOX /* We use other means to set reserved bit on our pages */ +#if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY) + { +#ifdef HAVE_KINFO_GETVMMAP + struct kinfo_vmentry *freep; + int i, cnt; + + freep = kinfo_getvmmap(getpid(), &cnt); + if (freep) { + mmap_lock(); + for (i = 0; i < cnt; i++) { + uintptr_t startaddr, endaddr; + + startaddr = freep[i].kve_start; + endaddr = freep[i].kve_end; + if (h2g_valid(startaddr)) { + startaddr = h2g(startaddr) & TARGET_PAGE_MASK; + + if (h2g_valid(endaddr)) { + endaddr = h2g(endaddr); + page_set_flags(startaddr, endaddr, PAGE_RESERVED); + } else { +#if TARGET_ABI_BITS <= L1_MAP_ADDR_SPACE_BITS + endaddr = ~0ul; + page_set_flags(startaddr, endaddr, PAGE_RESERVED); +#endif + } + } + } + free(freep); + mmap_unlock(); + } +#else + FILE *f; + + last_brk = (uintptr_t)sbrk(0); + + f = fopen("/compat/linux/proc/self/maps", "r"); + if (f) { + mmap_lock(); + + do { + uintptr_t startaddr, endaddr; + int n; + + n = fscanf (f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr); + + if (n 
== 2 && h2g_valid(startaddr)) { + startaddr = h2g(startaddr) & TARGET_PAGE_MASK; + + if (h2g_valid(endaddr)) { + endaddr = h2g(endaddr); + } else { + endaddr = ~0ul; + } + page_set_flags(startaddr, endaddr, PAGE_RESERVED); + } + } while (!feof(f)); + + fclose(f); + mmap_unlock(); + } +#endif + } +#endif +#endif /* !VBOX */ +} + +static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc) +{ + PageDesc *pd; + void **lp; + int i; + +#if defined(CONFIG_USER_ONLY) + /* We can't use qemu_malloc because it may recurse into a locked mutex. */ +# define ALLOC(P, SIZE) \ + do { \ + P = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, \ + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); \ + } while (0) +#else +# define ALLOC(P, SIZE) \ + do { P = qemu_mallocz(SIZE); } while (0) +#endif + + /* Level 1. Always allocated. */ + lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1)); + + /* Level 2..N-1. */ + for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) { + void **p = *lp; + + if (p == NULL) { + if (!alloc) { + return NULL; + } + ALLOC(p, sizeof(void *) * L2_SIZE); + *lp = p; + } + + lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1)); + } + + pd = *lp; + if (pd == NULL) { + if (!alloc) { + return NULL; + } + ALLOC(pd, sizeof(PageDesc) * L2_SIZE); + *lp = pd; + } + +#undef ALLOC + + return pd + (index & (L2_SIZE - 1)); +} + +static inline PageDesc *page_find(tb_page_addr_t index) +{ + return page_find_alloc(index, 0); +} + +#if !defined(CONFIG_USER_ONLY) +static PhysPageDesc *phys_page_find_alloc(target_phys_addr_t index, int alloc) +{ + PhysPageDesc *pd; + void **lp; + int i; + + /* Level 1. Always allocated. */ + lp = l1_phys_map + ((index >> P_L1_SHIFT) & (P_L1_SIZE - 1)); + + /* Level 2..N-1. */ + for (i = P_L1_SHIFT / L2_BITS - 1; i > 0; i--) { + void **p = *lp; + if (p == NULL) { + if (!alloc) { + return NULL; + } + *lp = p = qemu_mallocz(sizeof(void *) * L2_SIZE); + } + lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1)); + } + + pd = *lp; + if (pd == NULL) { + int i; + + if (!alloc) { + return NULL; + } + + *lp = pd = qemu_malloc(sizeof(PhysPageDesc) * L2_SIZE); + + for (i = 0; i < L2_SIZE; i++) { + pd[i].phys_offset = IO_MEM_UNASSIGNED; + pd[i].region_offset = (index + i) << TARGET_PAGE_BITS; + } + } + + return pd + (index & (L2_SIZE - 1)); +} + +static inline PhysPageDesc *phys_page_find(target_phys_addr_t index) +{ + return phys_page_find_alloc(index, 0); +} + +static void tlb_protect_code(ram_addr_t ram_addr); +static void tlb_unprotect_code_phys(CPUState *env, ram_addr_t ram_addr, + target_ulong vaddr); +#define mmap_lock() do { } while(0) +#define mmap_unlock() do { } while(0) +#endif + +#ifdef VBOX /* We don't need such huge codegen buffer size, as execute + most of the code in raw or hm mode. */ +#define DEFAULT_CODE_GEN_BUFFER_SIZE (8 * 1024 * 1024) +#else /* !VBOX */ +#define DEFAULT_CODE_GEN_BUFFER_SIZE (32 * 1024 * 1024) +#endif /* !VBOX */ + +#if defined(CONFIG_USER_ONLY) +/* Currently it is not recommended to allocate big chunks of data in + user mode. 
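A typical consumer of the two-level walk implemented by page_find_alloc() above looks like the sketch below; page_has_tbs() is a hypothetical helper (tb_phys_invalidate() later in this file performs the same lookup), and page_find() returns NULL for pages no TB has ever touched:

    /* Does any translated block intersect the guest page containing 'addr'? */
    static int page_has_tbs(tb_page_addr_t addr)
    {
        PageDesc *p = page_find(addr >> TARGET_PAGE_BITS);
        return p != NULL && p->first_tb != NULL;
    }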
It will change when a dedicated libc will be used */ +#define USE_STATIC_CODE_GEN_BUFFER +#endif + +#if defined(VBOX) && defined(USE_STATIC_CODE_GEN_BUFFER) +# error "VBox allocates codegen buffer dynamically" +#endif + +#ifdef USE_STATIC_CODE_GEN_BUFFER +static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] + __attribute__((aligned (CODE_GEN_ALIGN))); +#endif + +static void code_gen_alloc(uintptr_t tb_size) +{ +#ifdef USE_STATIC_CODE_GEN_BUFFER + code_gen_buffer = static_code_gen_buffer; + code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE; + map_exec(code_gen_buffer, code_gen_buffer_size); +#else +# ifdef VBOX + /* We cannot use phys_ram_size here, as it's 0 now, + * it only gets initialized once RAM registration callback + * (REMR3NotifyPhysRamRegister()) called. + */ + code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE; +# else /* !VBOX */ + code_gen_buffer_size = tb_size; + if (code_gen_buffer_size == 0) { +#if defined(CONFIG_USER_ONLY) + /* in user mode, phys_ram_size is not meaningful */ + code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE; +#else + /* XXX: needs adjustments */ + code_gen_buffer_size = (uintptr_t)(ram_size / 4); +#endif + } + if (code_gen_buffer_size < MIN_CODE_GEN_BUFFER_SIZE) + code_gen_buffer_size = MIN_CODE_GEN_BUFFER_SIZE; +# endif /* !VBOX */ + /* The code gen buffer location may have constraints depending on + the host cpu and OS */ +# ifdef VBOX + code_gen_buffer = RTMemExecAlloc(code_gen_buffer_size); + + if (!code_gen_buffer) { + LogRel(("REM: failed allocate codegen buffer %lld\n", + code_gen_buffer_size)); + return; + } +# else /* !VBOX */ +#if defined(__linux__) + { + int flags; + void *start = NULL; + + flags = MAP_PRIVATE | MAP_ANONYMOUS; +#if defined(__x86_64__) + flags |= MAP_32BIT; + /* Cannot map more than that */ + if (code_gen_buffer_size > (800 * 1024 * 1024)) + code_gen_buffer_size = (800 * 1024 * 1024); +#elif defined(__sparc_v9__) + // Map the buffer below 2G, so we can use direct calls and branches + flags |= MAP_FIXED; + start = (void *) 0x60000000UL; + if (code_gen_buffer_size > (512 * 1024 * 1024)) + code_gen_buffer_size = (512 * 1024 * 1024); +#elif defined(__arm__) + /* Map the buffer below 32M, so we can use direct calls and branches */ + flags |= MAP_FIXED; + start = (void *) 0x01000000UL; + if (code_gen_buffer_size > 16 * 1024 * 1024) + code_gen_buffer_size = 16 * 1024 * 1024; +#elif defined(__s390x__) + /* Map the buffer so that we can use direct calls and branches. */ + /* We have a +- 4GB range on the branches; leave some slop. 
*/ + if (code_gen_buffer_size > (3ul * 1024 * 1024 * 1024)) { + code_gen_buffer_size = 3ul * 1024 * 1024 * 1024; + } + start = (void *)0x90000000UL; +#endif + code_gen_buffer = mmap(start, code_gen_buffer_size, + PROT_WRITE | PROT_READ | PROT_EXEC, + flags, -1, 0); + if (code_gen_buffer == MAP_FAILED) { + fprintf(stderr, "Could not allocate dynamic translator buffer\n"); + exit(1); + } + } +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__) + { + int flags; + void *addr = NULL; + flags = MAP_PRIVATE | MAP_ANONYMOUS; +#if defined(__x86_64__) + /* FreeBSD doesn't have MAP_32BIT, use MAP_FIXED and assume + * 0x40000000 is free */ + flags |= MAP_FIXED; + addr = (void *)0x40000000; + /* Cannot map more than that */ + if (code_gen_buffer_size > (800 * 1024 * 1024)) + code_gen_buffer_size = (800 * 1024 * 1024); +#endif + code_gen_buffer = mmap(addr, code_gen_buffer_size, + PROT_WRITE | PROT_READ | PROT_EXEC, + flags, -1, 0); + if (code_gen_buffer == MAP_FAILED) { + fprintf(stderr, "Could not allocate dynamic translator buffer\n"); + exit(1); + } + } +#else + code_gen_buffer = qemu_malloc(code_gen_buffer_size); + map_exec(code_gen_buffer, code_gen_buffer_size); +#endif +# endif /* !VBOX */ +#endif /* !USE_STATIC_CODE_GEN_BUFFER */ +#ifndef VBOX /** @todo r=bird: why are we different? */ + map_exec(code_gen_prologue, sizeof(code_gen_prologue)); +#else + map_exec(code_gen_prologue, _1K); +#endif + code_gen_buffer_max_size = code_gen_buffer_size - + (TCG_MAX_OP_SIZE * OPC_MAX_SIZE); + code_gen_max_blocks = code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE; + tbs = qemu_malloc(code_gen_max_blocks * sizeof(TranslationBlock)); +} + +/* Must be called before using the QEMU cpus. 'tb_size' is the size + (in bytes) allocated to the translation buffer. Zero means default + size. */ +void cpu_exec_init_all(uintptr_t tb_size) +{ + cpu_gen_init(); + code_gen_alloc(tb_size); + code_gen_ptr = code_gen_buffer; + page_init(); +#if !defined(CONFIG_USER_ONLY) + io_mem_init(); +#endif +#if !defined(CONFIG_USER_ONLY) || !defined(CONFIG_USE_GUEST_BASE) + /* There's no guest base to take into account, so go ahead and + initialize the prologue now. */ + tcg_prologue_init(&tcg_ctx); +#endif +} + +#ifndef VBOX +#if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY) + +static int cpu_common_post_load(void *opaque, int version_id) +{ + CPUState *env = opaque; + + /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the + version_id is increased. 
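Plugging in the constants above as a rough sanity check (not a computation taken from the sources): with the VBox DEFAULT_CODE_GEN_BUFFER_SIZE of 8 MiB, code_gen_buffer_max_size reserves TCG_MAX_OP_SIZE * OPC_MAX_SIZE = 192 * 374 = 71808 bytes (about 70 KiB) of headroom at the end of the buffer, and with the CONFIG_SOFTMMU value of CODE_GEN_AVG_BLOCK_SIZE, code_gen_max_blocks = 8388608 / 128 = 65536 TranslationBlock descriptors are allocated for the tbs array.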
*/ + env->interrupt_request &= ~0x01; + tlb_flush(env, 1); + + return 0; +} + +static const VMStateDescription vmstate_cpu_common = { + .name = "cpu_common", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .post_load = cpu_common_post_load, + .fields = (VMStateField []) { + VMSTATE_UINT32(halted, CPUState), + VMSTATE_UINT32(interrupt_request, CPUState), + VMSTATE_END_OF_LIST() + } +}; +#endif + +CPUState *qemu_get_cpu(int cpu) +{ + CPUState *env = first_cpu; + + while (env) { + if (env->cpu_index == cpu) + break; + env = env->next_cpu; + } + + return env; +} + +#endif /* !VBOX */ + +void cpu_exec_init(CPUState *env) +{ + CPUState **penv; + int cpu_index; + +#if defined(CONFIG_USER_ONLY) + cpu_list_lock(); +#endif + env->next_cpu = NULL; + penv = &first_cpu; + cpu_index = 0; + while (*penv != NULL) { + penv = &(*penv)->next_cpu; + cpu_index++; + } + env->cpu_index = cpu_index; + env->numa_node = 0; + QTAILQ_INIT(&env->breakpoints); + QTAILQ_INIT(&env->watchpoints); + *penv = env; +#ifndef VBOX +#if defined(CONFIG_USER_ONLY) + cpu_list_unlock(); +#endif +#if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY) + vmstate_register(NULL, cpu_index, &vmstate_cpu_common, env); + register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION, + cpu_save, cpu_load, env); +#endif +#endif /* !VBOX */ +} + +static inline void invalidate_page_bitmap(PageDesc *p) +{ + if (p->code_bitmap) { + qemu_free(p->code_bitmap); + p->code_bitmap = NULL; + } + p->code_write_count = 0; +} + +/* Set to NULL all the 'first_tb' fields in all PageDescs. */ + +static void page_flush_tb_1 (int level, void **lp) +{ + int i; + + if (*lp == NULL) { + return; + } + if (level == 0) { + PageDesc *pd = *lp; + for (i = 0; i < L2_SIZE; ++i) { + pd[i].first_tb = NULL; + invalidate_page_bitmap(pd + i); + } + } else { + void **pp = *lp; + for (i = 0; i < L2_SIZE; ++i) { + page_flush_tb_1 (level - 1, pp + i); + } + } +} + +static void page_flush_tb(void) +{ + int i; + for (i = 0; i < V_L1_SIZE; i++) { + page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i); + } +} + +/* flush all the translation blocks */ +/* XXX: tb_flush is currently not thread safe */ +void tb_flush(CPUState *env1) +{ + CPUState *env; +#ifdef VBOX + STAM_PROFILE_START(&env1->StatTbFlush, a); +#endif +#if defined(DEBUG_FLUSH) + printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n", + (unsigned long)(code_gen_ptr - code_gen_buffer), + nb_tbs, nb_tbs > 0 ? 
+ ((unsigned long)(code_gen_ptr - code_gen_buffer)) / nb_tbs : 0); +#endif + if ((uintptr_t)(code_gen_ptr - code_gen_buffer) > code_gen_buffer_size) + cpu_abort(env1, "Internal error: code buffer overflow\n"); + + nb_tbs = 0; + + for(env = first_cpu; env != NULL; env = env->next_cpu) { + memset (env->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof (void *)); + } + + memset (tb_phys_hash, 0, CODE_GEN_PHYS_HASH_SIZE * sizeof (void *)); + page_flush_tb(); + + code_gen_ptr = code_gen_buffer; + /* XXX: flush processor icache at this point if cache flush is + expensive */ + tb_flush_count++; +#ifdef VBOX + STAM_PROFILE_STOP(&env1->StatTbFlush, a); +#endif +} + +#ifdef DEBUG_TB_CHECK + +static void tb_invalidate_check(target_ulong address) +{ + TranslationBlock *tb; + int i; + address &= TARGET_PAGE_MASK; + for(i = 0;i < CODE_GEN_PHYS_HASH_SIZE; i++) { + for(tb = tb_phys_hash[i]; tb != NULL; tb = tb->phys_hash_next) { + if (!(address + TARGET_PAGE_SIZE <= tb->pc || + address >= tb->pc + tb->size)) { + printf("ERROR invalidate: address=" TARGET_FMT_lx + " PC=%08lx size=%04x\n", + address, (long)tb->pc, tb->size); + } + } + } +} + +/* verify that all the pages have correct rights for code */ +static void tb_page_check(void) +{ + TranslationBlock *tb; + int i, flags1, flags2; + + for(i = 0;i < CODE_GEN_PHYS_HASH_SIZE; i++) { + for(tb = tb_phys_hash[i]; tb != NULL; tb = tb->phys_hash_next) { + flags1 = page_get_flags(tb->pc); + flags2 = page_get_flags(tb->pc + tb->size - 1); + if ((flags1 & PAGE_WRITE) || (flags2 & PAGE_WRITE)) { + printf("ERROR page flags: PC=%08lx size=%04x f1=%x f2=%x\n", + (long)tb->pc, tb->size, flags1, flags2); + } + } + } +} + +#endif + +/* invalidate one TB */ +static inline void tb_remove(TranslationBlock **ptb, TranslationBlock *tb, + int next_offset) +{ + TranslationBlock *tb1; + for(;;) { + tb1 = *ptb; + if (tb1 == tb) { + *ptb = *(TranslationBlock **)((char *)tb1 + next_offset); + break; + } + ptb = (TranslationBlock **)((char *)tb1 + next_offset); + } +} + +static inline void tb_page_remove(TranslationBlock **ptb, TranslationBlock *tb) +{ + TranslationBlock *tb1; + unsigned int n1; + + for(;;) { + tb1 = *ptb; + n1 = (intptr_t)tb1 & 3; + tb1 = (TranslationBlock *)((intptr_t)tb1 & ~3); + if (tb1 == tb) { + *ptb = tb1->page_next[n1]; + break; + } + ptb = &tb1->page_next[n1]; + } +} + +static inline void tb_jmp_remove(TranslationBlock *tb, int n) +{ + TranslationBlock *tb1, **ptb; + unsigned int n1; + + ptb = &tb->jmp_next[n]; + tb1 = *ptb; + if (tb1) { + /* find tb(n) in circular list */ + for(;;) { + tb1 = *ptb; + n1 = (intptr_t)tb1 & 3; + tb1 = (TranslationBlock *)((intptr_t)tb1 & ~3); + if (n1 == n && tb1 == tb) + break; + if (n1 == 2) { + ptb = &tb1->jmp_first; + } else { + ptb = &tb1->jmp_next[n1]; + } + } + /* now we can suppress tb(n) from the list */ + *ptb = tb->jmp_next[n]; + + tb->jmp_next[n] = NULL; + } +} + +/* reset the jump entry 'n' of a TB so that it is not chained to + another TB */ +static inline void tb_reset_jump(TranslationBlock *tb, int n) +{ + tb_set_jmp_target(tb, n, (uintptr_t)(tb->tc_ptr + tb->tb_next_offset[n])); +} + +void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) +{ + CPUState *env; + PageDesc *p; + unsigned int h, n1; + tb_page_addr_t phys_pc; + TranslationBlock *tb1, *tb2; + + /* remove the TB from the hash list */ + phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); + h = tb_phys_hash_func(phys_pc); + tb_remove(&tb_phys_hash[h], tb, + offsetof(TranslationBlock, phys_hash_next)); + + /* remove the TB from the 
page list */ + if (tb->page_addr[0] != page_addr) { + p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); + tb_page_remove(&p->first_tb, tb); + invalidate_page_bitmap(p); + } + if (tb->page_addr[1] != -1 && tb->page_addr[1] != page_addr) { + p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS); + tb_page_remove(&p->first_tb, tb); + invalidate_page_bitmap(p); + } + + tb_invalidated_flag = 1; + + /* remove the TB from the hash list */ + h = tb_jmp_cache_hash_func(tb->pc); + for(env = first_cpu; env != NULL; env = env->next_cpu) { + if (env->tb_jmp_cache[h] == tb) + env->tb_jmp_cache[h] = NULL; + } + + /* suppress this TB from the two jump lists */ + tb_jmp_remove(tb, 0); + tb_jmp_remove(tb, 1); + + /* suppress any remaining jumps to this TB */ + tb1 = tb->jmp_first; + for(;;) { + n1 = (intptr_t)tb1 & 3; + if (n1 == 2) + break; + tb1 = (TranslationBlock *)((intptr_t)tb1 & ~3); + tb2 = tb1->jmp_next[n1]; + tb_reset_jump(tb1, n1); + tb1->jmp_next[n1] = NULL; + tb1 = tb2; + } + tb->jmp_first = (TranslationBlock *)((intptr_t)tb | 2); /* fail safe */ + + tb_phys_invalidate_count++; +} + +#ifdef VBOX + +void tb_invalidate_virt(CPUState *env, uint32_t eip) +{ +# if 1 + tb_flush(env); +# else + uint8_t *cs_base, *pc; + unsigned int flags, h, phys_pc; + TranslationBlock *tb, **ptb; + + flags = env->hflags; + flags |= (env->eflags & (IOPL_MASK | TF_MASK | VM_MASK)); + cs_base = env->segs[R_CS].base; + pc = cs_base + eip; + + tb = tb_find(&ptb, (uintptr_t)pc, (uintptr_t)cs_base, + flags); + + if(tb) + { +# ifdef DEBUG + printf("invalidating TB (%08X) at %08X\n", tb, eip); +# endif + tb_invalidate(tb); + //Note: this will leak TBs, but the whole cache will be flushed + // when it happens too often + tb->pc = 0; + tb->cs_base = 0; + tb->flags = 0; + } +# endif +} + +# ifdef VBOX_STRICT +/** + * Gets the page offset. + */ +ram_addr_t get_phys_page_offset(target_ulong addr) +{ + PhysPageDesc *p = phys_page_find(addr >> TARGET_PAGE_BITS); + return p ? 
p->phys_offset : 0; +} +# endif /* VBOX_STRICT */ + +#endif /* VBOX */ + +static inline void set_bits(uint8_t *tab, int start, int len) +{ + int end, mask, end1; + + end = start + len; + tab += start >> 3; + mask = 0xff << (start & 7); + if ((start & ~7) == (end & ~7)) { + if (start < end) { + mask &= ~(0xff << (end & 7)); + *tab |= mask; + } + } else { + *tab++ |= mask; + start = (start + 8) & ~7; + end1 = end & ~7; + while (start < end1) { + *tab++ = 0xff; + start += 8; + } + if (start < end) { + mask = ~(0xff << (end & 7)); + *tab |= mask; + } + } +} + +static void build_page_bitmap(PageDesc *p) +{ + int n, tb_start, tb_end; + TranslationBlock *tb; + + p->code_bitmap = qemu_mallocz(TARGET_PAGE_SIZE / 8); + + tb = p->first_tb; + while (tb != NULL) { + n = (intptr_t)tb & 3; + tb = (TranslationBlock *)((intptr_t)tb & ~3); + /* NOTE: this is subtle as a TB may span two physical pages */ + if (n == 0) { + /* NOTE: tb_end may be after the end of the page, but + it is not a problem */ + tb_start = tb->pc & ~TARGET_PAGE_MASK; + tb_end = tb_start + tb->size; + if (tb_end > TARGET_PAGE_SIZE) + tb_end = TARGET_PAGE_SIZE; + } else { + tb_start = 0; + tb_end = ((tb->pc + tb->size) & ~TARGET_PAGE_MASK); + } + set_bits(p->code_bitmap, tb_start, tb_end - tb_start); + tb = tb->page_next[n]; + } +} + +TranslationBlock *tb_gen_code(CPUState *env, + target_ulong pc, target_ulong cs_base, + int flags, int cflags) +{ + TranslationBlock *tb; + uint8_t *tc_ptr; + tb_page_addr_t phys_pc, phys_page2; + target_ulong virt_page2; + int code_gen_size; + + phys_pc = get_page_addr_code(env, pc); + tb = tb_alloc(pc); + if (!tb) { + /* flush must be done */ + tb_flush(env); + /* cannot fail at this point */ + tb = tb_alloc(pc); + /* Don't forget to invalidate previous TB info. */ + tb_invalidated_flag = 1; + } + tc_ptr = code_gen_ptr; + tb->tc_ptr = tc_ptr; + tb->cs_base = cs_base; + tb->flags = flags; + tb->cflags = cflags; + cpu_gen_code(env, tb, &code_gen_size); + code_gen_ptr = (void *)(((uintptr_t)code_gen_ptr + code_gen_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1)); + + /* check next page if needed */ + virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK; + phys_page2 = -1; + if ((pc & TARGET_PAGE_MASK) != virt_page2) { + phys_page2 = get_page_addr_code(env, virt_page2); + } + tb_link_page(tb, phys_pc, phys_page2); + return tb; +} + +/* invalidate all TBs which intersect with the target physical page + starting in range [start;end[. NOTE: start and end must refer to + the same physical page. 'is_cpu_write_access' should be true if called + from a real cpu write access: the virtual CPU will exit the current + TB if code is modified inside this TB. 
*/ +void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, + int is_cpu_write_access) +{ + TranslationBlock *tb, *tb_next, *saved_tb; + CPUState *env = cpu_single_env; + tb_page_addr_t tb_start, tb_end; + PageDesc *p; + int n; +#ifdef TARGET_HAS_PRECISE_SMC + int current_tb_not_found = is_cpu_write_access; + TranslationBlock *current_tb = NULL; + int current_tb_modified = 0; + target_ulong current_pc = 0; + target_ulong current_cs_base = 0; + int current_flags = 0; +#endif /* TARGET_HAS_PRECISE_SMC */ + + p = page_find(start >> TARGET_PAGE_BITS); + if (!p) + return; + if (!p->code_bitmap && + ++p->code_write_count >= SMC_BITMAP_USE_THRESHOLD && + is_cpu_write_access) { + /* build code bitmap */ + build_page_bitmap(p); + } + + /* we remove all the TBs in the range [start, end[ */ + /* XXX: see if in some cases it could be faster to invalidate all the code */ + tb = p->first_tb; + while (tb != NULL) { + n = (intptr_t)tb & 3; + tb = (TranslationBlock *)((intptr_t)tb & ~3); + tb_next = tb->page_next[n]; + /* NOTE: this is subtle as a TB may span two physical pages */ + if (n == 0) { + /* NOTE: tb_end may be after the end of the page, but + it is not a problem */ + tb_start = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); + tb_end = tb_start + tb->size; + } else { + tb_start = tb->page_addr[1]; + tb_end = tb_start + ((tb->pc + tb->size) & ~TARGET_PAGE_MASK); + } + if (!(tb_end <= start || tb_start >= end)) { +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb_not_found) { + current_tb_not_found = 0; + current_tb = NULL; + if (env->mem_io_pc) { + /* now we have a real cpu fault */ + current_tb = tb_find_pc(env->mem_io_pc); + } + } + if (current_tb == tb && + (current_tb->cflags & CF_COUNT_MASK) != 1) { + /* If we are modifying the current TB, we must stop + its execution. We could be more precise by checking + that the modification is after the current PC, but it + would require a specialized function to partially + restore the CPU state */ + + current_tb_modified = 1; + cpu_restore_state(current_tb, env, + env->mem_io_pc, NULL); + cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base, + &current_flags); + } +#endif /* TARGET_HAS_PRECISE_SMC */ + /* we need to do that to handle the case where a signal + occurs while doing tb_phys_invalidate() */ + saved_tb = NULL; + if (env) { + saved_tb = env->current_tb; + env->current_tb = NULL; + } + tb_phys_invalidate(tb, -1); + if (env) { + env->current_tb = saved_tb; + if (env->interrupt_request && env->current_tb) + cpu_interrupt(env, env->interrupt_request); + } + } + tb = tb_next; + } +#if !defined(CONFIG_USER_ONLY) + /* if no code remaining, no need to continue to use slow writes */ + if (!p->first_tb) { + invalidate_page_bitmap(p); + if (is_cpu_write_access) { + tlb_unprotect_code_phys(env, start, env->mem_io_vaddr); + } + } +#endif +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb_modified) { + /* we generate a block containing just the instruction + modifying the memory. 
It will ensure that it cannot modify + itself */ + env->current_tb = NULL; + tb_gen_code(env, current_pc, current_cs_base, current_flags, 1); + cpu_resume_from_signal(env, NULL); + } +#endif +} + +/* len must be <= 8 and start must be a multiple of len */ +static inline void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len) +{ + PageDesc *p; + int offset, b; +#if 0 + if (1) { + qemu_log("modifying code at 0x%x size=%d EIP=%x PC=%08x\n", + cpu_single_env->mem_io_vaddr, len, + cpu_single_env->eip, + cpu_single_env->eip + (intptr_t)cpu_single_env->segs[R_CS].base); + } +#endif + p = page_find(start >> TARGET_PAGE_BITS); + if (!p) + return; + if (p->code_bitmap) { + offset = start & ~TARGET_PAGE_MASK; + b = p->code_bitmap[offset >> 3] >> (offset & 7); + if (b & ((1 << len) - 1)) + goto do_invalidate; + } else { + do_invalidate: + tb_invalidate_phys_page_range(start, start + len, 1); + } +} + +#if !defined(CONFIG_SOFTMMU) +static void tb_invalidate_phys_page(tb_page_addr_t addr, + uintptr_t pc, void *puc) +{ + TranslationBlock *tb; + PageDesc *p; + int n; +#ifdef TARGET_HAS_PRECISE_SMC + TranslationBlock *current_tb = NULL; + CPUState *env = cpu_single_env; + int current_tb_modified = 0; + target_ulong current_pc = 0; + target_ulong current_cs_base = 0; + int current_flags = 0; +#endif + + addr &= TARGET_PAGE_MASK; + p = page_find(addr >> TARGET_PAGE_BITS); + if (!p) + return; + tb = p->first_tb; +#ifdef TARGET_HAS_PRECISE_SMC + if (tb && pc != 0) { + current_tb = tb_find_pc(pc); + } +#endif + while (tb != NULL) { + n = (intptr_t)tb & 3; + tb = (TranslationBlock *)((intptr_t)tb & ~3); +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb == tb && + (current_tb->cflags & CF_COUNT_MASK) != 1) { + /* If we are modifying the current TB, we must stop + its execution. We could be more precise by checking + that the modification is after the current PC, but it + would require a specialized function to partially + restore the CPU state */ + + current_tb_modified = 1; + cpu_restore_state(current_tb, env, pc, puc); + cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base, + &current_flags); + } +#endif /* TARGET_HAS_PRECISE_SMC */ + tb_phys_invalidate(tb, addr); + tb = tb->page_next[n]; + } + p->first_tb = NULL; +#ifdef TARGET_HAS_PRECISE_SMC + if (current_tb_modified) { + /* we generate a block containing just the instruction + modifying the memory. 
It will ensure that it cannot modify + itself */ + env->current_tb = NULL; + tb_gen_code(env, current_pc, current_cs_base, current_flags, 1); + cpu_resume_from_signal(env, puc); + } +#endif +} +#endif + +/* add the tb in the target page and protect it if necessary */ +static inline void tb_alloc_page(TranslationBlock *tb, + unsigned int n, tb_page_addr_t page_addr) +{ + PageDesc *p; + TranslationBlock *last_first_tb; + + tb->page_addr[n] = page_addr; + p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1); + tb->page_next[n] = p->first_tb; + last_first_tb = p->first_tb; + p->first_tb = (TranslationBlock *)((intptr_t)tb | n); + invalidate_page_bitmap(p); + +#if defined(TARGET_HAS_SMC) || 1 + +#if defined(CONFIG_USER_ONLY) + if (p->flags & PAGE_WRITE) { + target_ulong addr; + PageDesc *p2; + int prot; + + /* force the host page as non writable (writes will have a + page fault + mprotect overhead) */ + page_addr &= qemu_host_page_mask; + prot = 0; + for(addr = page_addr; addr < page_addr + qemu_host_page_size; + addr += TARGET_PAGE_SIZE) { + + p2 = page_find (addr >> TARGET_PAGE_BITS); + if (!p2) + continue; + prot |= p2->flags; + p2->flags &= ~PAGE_WRITE; + } + mprotect(g2h(page_addr), qemu_host_page_size, + (prot & PAGE_BITS) & ~PAGE_WRITE); +#ifdef DEBUG_TB_INVALIDATE + printf("protecting code page: 0x" TARGET_FMT_lx "\n", + page_addr); +#endif + } +#else + /* if some code is already present, then the pages are already + protected. So we handle the case where only the first TB is + allocated in a physical page */ + if (!last_first_tb) { + tlb_protect_code(page_addr); + } +#endif + +#endif /* TARGET_HAS_SMC */ +} + +/* Allocate a new translation block. Flush the translation buffer if + too many translation blocks or too much generated code. */ +TranslationBlock *tb_alloc(target_ulong pc) +{ + TranslationBlock *tb; + + if (nb_tbs >= code_gen_max_blocks || + (code_gen_ptr - code_gen_buffer) >= VBOX_ONLY((uintptr_t))code_gen_buffer_max_size) + return NULL; + tb = &tbs[nb_tbs++]; + tb->pc = pc; + tb->cflags = 0; + return tb; +} + +void tb_free(TranslationBlock *tb) +{ + /* In practice this is mostly used for single use temporary TB + Ignore the hard cases and just back up if this TB happens to + be the last one generated. */ + if (nb_tbs > 0 && tb == &tbs[nb_tbs - 1]) { + code_gen_ptr = tb->tc_ptr; + nb_tbs--; + } +} + +/* add a new TB and link it to the physical page tables. phys_page2 is + (-1) to indicate that only one page contains the TB. */ +void tb_link_page(TranslationBlock *tb, + tb_page_addr_t phys_pc, tb_page_addr_t phys_page2) +{ + unsigned int h; + TranslationBlock **ptb; + + /* Grab the mmap lock to stop another thread invalidating this TB + before we are done. */ + mmap_lock(); + /* add in the physical hash table */ + h = tb_phys_hash_func(phys_pc); + ptb = &tb_phys_hash[h]; + tb->phys_hash_next = *ptb; + *ptb = tb; + + /* add in the page list */ + tb_alloc_page(tb, 0, phys_pc & TARGET_PAGE_MASK); + if (phys_page2 != -1) + tb_alloc_page(tb, 1, phys_page2); + else + tb->page_addr[1] = -1; + + tb->jmp_first = (TranslationBlock *)((intptr_t)tb | 2); + tb->jmp_next[0] = NULL; + tb->jmp_next[1] = NULL; + + /* init original jump addresses */ + if (tb->tb_next_offset[0] != 0xffff) + tb_reset_jump(tb, 0); + if (tb->tb_next_offset[1] != 0xffff) + tb_reset_jump(tb, 1); + +#ifdef DEBUG_TB_CHECK + tb_page_check(); +#endif + mmap_unlock(); +} + +/* find the TB 'tb' such that tb[0].tc_ptr <= tc_ptr < + tb[1].tc_ptr. 
Return NULL if not found */ +TranslationBlock *tb_find_pc(uintptr_t tc_ptr) +{ + int m_min, m_max, m; + uintptr_t v; + TranslationBlock *tb; + + if (nb_tbs <= 0) + return NULL; + if (tc_ptr < (uintptr_t)code_gen_buffer || + tc_ptr >= (uintptr_t)code_gen_ptr) + return NULL; + /* binary search (cf Knuth) */ + m_min = 0; + m_max = nb_tbs - 1; + while (m_min <= m_max) { + m = (m_min + m_max) >> 1; + tb = &tbs[m]; + v = (uintptr_t)tb->tc_ptr; + if (v == tc_ptr) + return tb; + else if (tc_ptr < v) { + m_max = m - 1; + } else { + m_min = m + 1; + } + } + return &tbs[m_max]; +} + +static void tb_reset_jump_recursive(TranslationBlock *tb); + +static inline void tb_reset_jump_recursive2(TranslationBlock *tb, int n) +{ + TranslationBlock *tb1, *tb_next, **ptb; + unsigned int n1; + + tb1 = tb->jmp_next[n]; + if (tb1 != NULL) { + /* find head of list */ + for(;;) { + n1 = (intptr_t)tb1 & 3; + tb1 = (TranslationBlock *)((intptr_t)tb1 & ~3); + if (n1 == 2) + break; + tb1 = tb1->jmp_next[n1]; + } + /* we are now sure now that tb jumps to tb1 */ + tb_next = tb1; + + /* remove tb from the jmp_first list */ + ptb = &tb_next->jmp_first; + for(;;) { + tb1 = *ptb; + n1 = (intptr_t)tb1 & 3; + tb1 = (TranslationBlock *)((intptr_t)tb1 & ~3); + if (n1 == n && tb1 == tb) + break; + ptb = &tb1->jmp_next[n1]; + } + *ptb = tb->jmp_next[n]; + tb->jmp_next[n] = NULL; + + /* suppress the jump to next tb in generated code */ + tb_reset_jump(tb, n); + + /* suppress jumps in the tb on which we could have jumped */ + tb_reset_jump_recursive(tb_next); + } +} + +static void tb_reset_jump_recursive(TranslationBlock *tb) +{ + tb_reset_jump_recursive2(tb, 0); + tb_reset_jump_recursive2(tb, 1); +} + +#if defined(TARGET_HAS_ICE) +#if defined(CONFIG_USER_ONLY) +static void breakpoint_invalidate(CPUState *env, target_ulong pc) +{ + tb_invalidate_phys_page_range(pc, pc + 1, 0); +} +#else +static void breakpoint_invalidate(CPUState *env, target_ulong pc) +{ + target_phys_addr_t addr; + target_ulong pd; + ram_addr_t ram_addr; + PhysPageDesc *p; + + addr = cpu_get_phys_page_debug(env, pc); + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + ram_addr = (pd & TARGET_PAGE_MASK) | (pc & ~TARGET_PAGE_MASK); + tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0); +} +#endif +#endif /* TARGET_HAS_ICE */ + +#if defined(CONFIG_USER_ONLY) +void cpu_watchpoint_remove_all(CPUState *env, int mask) + +{ +} + +int cpu_watchpoint_insert(CPUState *env, target_ulong addr, target_ulong len, + int flags, CPUWatchpoint **watchpoint) +{ + return -ENOSYS; +} +#else +/* Add a watchpoint. 
*/ +int cpu_watchpoint_insert(CPUState *env, target_ulong addr, target_ulong len, + int flags, CPUWatchpoint **watchpoint) +{ + target_ulong len_mask = ~(len - 1); + CPUWatchpoint *wp; + + /* sanity checks: allow power-of-2 lengths, deny unaligned watchpoints */ + if ((len != 1 && len != 2 && len != 4 && len != 8) || (addr & ~len_mask)) { + fprintf(stderr, "qemu: tried to set invalid watchpoint at " + TARGET_FMT_lx ", len=" TARGET_FMT_lu "\n", addr, len); +#ifndef VBOX + return -EINVAL; +#else + return VERR_INVALID_PARAMETER; +#endif + } + wp = qemu_malloc(sizeof(*wp)); + + wp->vaddr = addr; + wp->len_mask = len_mask; + wp->flags = flags; + + /* keep all GDB-injected watchpoints in front */ + if (flags & BP_GDB) + QTAILQ_INSERT_HEAD(&env->watchpoints, wp, entry); + else + QTAILQ_INSERT_TAIL(&env->watchpoints, wp, entry); + + tlb_flush_page(env, addr); + + if (watchpoint) + *watchpoint = wp; + return 0; +} + +/* Remove a specific watchpoint. */ +int cpu_watchpoint_remove(CPUState *env, target_ulong addr, target_ulong len, + int flags) +{ + target_ulong len_mask = ~(len - 1); + CPUWatchpoint *wp; + + QTAILQ_FOREACH(wp, &env->watchpoints, entry) { + if (addr == wp->vaddr && len_mask == wp->len_mask + && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) { + cpu_watchpoint_remove_by_ref(env, wp); + return 0; + } + } +#ifndef VBOX + return -ENOENT; +#else + return VERR_NOT_FOUND; +#endif +} + +/* Remove a specific watchpoint by reference. */ +void cpu_watchpoint_remove_by_ref(CPUState *env, CPUWatchpoint *watchpoint) +{ + QTAILQ_REMOVE(&env->watchpoints, watchpoint, entry); + + tlb_flush_page(env, watchpoint->vaddr); + + qemu_free(watchpoint); +} + +/* Remove all matching watchpoints. */ +void cpu_watchpoint_remove_all(CPUState *env, int mask) +{ + CPUWatchpoint *wp, *next; + + QTAILQ_FOREACH_SAFE(wp, &env->watchpoints, entry, next) { + if (wp->flags & mask) + cpu_watchpoint_remove_by_ref(env, wp); + } +} +#endif + +/* Add a breakpoint. */ +int cpu_breakpoint_insert(CPUState *env, target_ulong pc, int flags, + CPUBreakpoint **breakpoint) +{ +#if defined(TARGET_HAS_ICE) + CPUBreakpoint *bp; + + bp = qemu_malloc(sizeof(*bp)); + + bp->pc = pc; + bp->flags = flags; + + /* keep all GDB-injected breakpoints in front */ + if (flags & BP_GDB) + QTAILQ_INSERT_HEAD(&env->breakpoints, bp, entry); + else + QTAILQ_INSERT_TAIL(&env->breakpoints, bp, entry); + + breakpoint_invalidate(env, pc); + + if (breakpoint) + *breakpoint = bp; + return 0; +#else + return -ENOSYS; +#endif +} + +/* Remove a specific breakpoint. */ +int cpu_breakpoint_remove(CPUState *env, target_ulong pc, int flags) +{ +#if defined(TARGET_HAS_ICE) + CPUBreakpoint *bp; + + QTAILQ_FOREACH(bp, &env->breakpoints, entry) { + if (bp->pc == pc && bp->flags == flags) { + cpu_breakpoint_remove_by_ref(env, bp); + return 0; + } + } +# ifndef VBOX + return -ENOENT; +# else + return VERR_NOT_FOUND; +# endif +#else + return -ENOSYS; +#endif +} + +/* Remove a specific breakpoint by reference. */ +void cpu_breakpoint_remove_by_ref(CPUState *env, CPUBreakpoint *breakpoint) +{ +#if defined(TARGET_HAS_ICE) + QTAILQ_REMOVE(&env->breakpoints, breakpoint, entry); + + breakpoint_invalidate(env, breakpoint->pc); + + qemu_free(breakpoint); +#endif +} + +/* Remove all matching breakpoints. 
*/ +void cpu_breakpoint_remove_all(CPUState *env, int mask) +{ +#if defined(TARGET_HAS_ICE) + CPUBreakpoint *bp, *next; + + QTAILQ_FOREACH_SAFE(bp, &env->breakpoints, entry, next) { + if (bp->flags & mask) + cpu_breakpoint_remove_by_ref(env, bp); + } +#endif +} + +/* enable or disable single step mode. EXCP_DEBUG is returned by the + CPU loop after each instruction */ +void cpu_single_step(CPUState *env, int enabled) +{ +#if defined(TARGET_HAS_ICE) + if (env->singlestep_enabled != enabled) { + env->singlestep_enabled = enabled; + if (kvm_enabled()) + kvm_update_guest_debug(env, 0); + else { + /* must flush all the translated code to avoid inconsistencies */ + /* XXX: only flush what is necessary */ + tb_flush(env); + } + } +#endif +} + +#ifndef VBOX + +/* enable or disable low levels log */ +void cpu_set_log(int log_flags) +{ + loglevel = log_flags; + if (loglevel && !logfile) { + logfile = fopen(logfilename, log_append ? "a" : "w"); + if (!logfile) { + perror(logfilename); + _exit(1); + } +#if !defined(CONFIG_SOFTMMU) + /* must avoid mmap() usage of glibc by setting a buffer "by hand" */ + { + static char logfile_buf[4096]; + setvbuf(logfile, logfile_buf, _IOLBF, sizeof(logfile_buf)); + } +#elif !defined(_WIN32) + /* Win32 doesn't support line-buffering and requires size >= 2 */ + setvbuf(logfile, NULL, _IOLBF, 0); +#endif + log_append = 1; + } + if (!loglevel && logfile) { + fclose(logfile); + logfile = NULL; + } +} + +void cpu_set_log_filename(const char *filename) +{ + logfilename = strdup(filename); + if (logfile) { + fclose(logfile); + logfile = NULL; + } + cpu_set_log(loglevel); +} + +#endif /* !VBOX */ + +static void cpu_unlink_tb(CPUState *env) +{ + /* FIXME: TB unchaining isn't SMP safe. For now just ignore the + problem and hope the cpu will stop of its own accord. For userspace + emulation this often isn't actually as bad as it sounds. Often + signals are used primarily to interrupt blocking syscalls. */ + TranslationBlock *tb; + static spinlock_t interrupt_lock = SPIN_LOCK_UNLOCKED; + + spin_lock(&interrupt_lock); + tb = env->current_tb; + /* if the cpu is currently executing code, we must unlink it and + all the potentially executing TB */ + if (tb) { + env->current_tb = NULL; + tb_reset_jump_recursive(tb); + } + spin_unlock(&interrupt_lock); +} + +/* mask must never be zero, except for A20 change call */ +void cpu_interrupt(CPUState *env, int mask) +{ + int old_mask; + + old_mask = env->interrupt_request; +#ifndef VBOX + env->interrupt_request |= mask; +#else /* VBOX */ + VM_ASSERT_EMT(env->pVM); + ASMAtomicOrS32((int32_t volatile *)&env->interrupt_request, mask); +#endif /* VBOX */ + +#ifndef VBOX +#ifndef CONFIG_USER_ONLY + /* + * If called from iothread context, wake the target cpu in + * case its halted. + */ + if (!qemu_cpu_self(env)) { + qemu_cpu_kick(env); + return; + } +#endif +#endif /* !VBOX */ + + if (use_icount) { + env->icount_decr.u16.high = 0xffff; +#ifndef CONFIG_USER_ONLY + if (!can_do_io(env) + && (mask & ~old_mask) != 0) { + cpu_abort(env, "Raised interrupt while not in I/O function"); + } +#endif + } else { + cpu_unlink_tb(env); + } +} + +void cpu_reset_interrupt(CPUState *env, int mask) +{ +#ifdef VBOX + /* + * Note: the current implementation can be executed by another thread without problems; make sure this remains true + * for future changes! 
+ */ + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~mask); +#else /* !VBOX */ + env->interrupt_request &= ~mask; +#endif /* !VBOX */ +} + +void cpu_exit(CPUState *env) +{ + env->exit_request = 1; + cpu_unlink_tb(env); +} + +#ifndef VBOX +const CPULogItem cpu_log_items[] = { + { CPU_LOG_TB_OUT_ASM, "out_asm", + "show generated host assembly code for each compiled TB" }, + { CPU_LOG_TB_IN_ASM, "in_asm", + "show target assembly code for each compiled TB" }, + { CPU_LOG_TB_OP, "op", + "show micro ops for each compiled TB" }, + { CPU_LOG_TB_OP_OPT, "op_opt", + "show micro ops " +#ifdef TARGET_I386 + "before eflags optimization and " +#endif + "after liveness analysis" }, + { CPU_LOG_INT, "int", + "show interrupts/exceptions in short format" }, + { CPU_LOG_EXEC, "exec", + "show trace before each executed TB (lots of logs)" }, + { CPU_LOG_TB_CPU, "cpu", + "show CPU state before block translation" }, +#ifdef TARGET_I386 + { CPU_LOG_PCALL, "pcall", + "show protected mode far calls/returns/exceptions" }, + { CPU_LOG_RESET, "cpu_reset", + "show CPU state before CPU resets" }, +#endif +#ifdef DEBUG_IOPORT + { CPU_LOG_IOPORT, "ioport", + "show all i/o ports accesses" }, +#endif + { 0, NULL, NULL }, +}; + +#ifndef CONFIG_USER_ONLY +static QLIST_HEAD(memory_client_list, CPUPhysMemoryClient) memory_client_list + = QLIST_HEAD_INITIALIZER(memory_client_list); + +static void cpu_notify_set_memory(target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset) +{ + CPUPhysMemoryClient *client; + QLIST_FOREACH(client, &memory_client_list, list) { + client->set_memory(client, start_addr, size, phys_offset); + } +} + +static int cpu_notify_sync_dirty_bitmap(target_phys_addr_t start, + target_phys_addr_t end) +{ + CPUPhysMemoryClient *client; + QLIST_FOREACH(client, &memory_client_list, list) { + int r = client->sync_dirty_bitmap(client, start, end); + if (r < 0) + return r; + } + return 0; +} + +static int cpu_notify_migration_log(int enable) +{ + CPUPhysMemoryClient *client; + QLIST_FOREACH(client, &memory_client_list, list) { + int r = client->migration_log(client, enable); + if (r < 0) + return r; + } + return 0; +} + +static void phys_page_for_each_1(CPUPhysMemoryClient *client, + int level, void **lp) +{ + int i; + + if (*lp == NULL) { + return; + } + if (level == 0) { + PhysPageDesc *pd = *lp; + for (i = 0; i < L2_SIZE; ++i) { + if (pd[i].phys_offset != IO_MEM_UNASSIGNED) { + client->set_memory(client, pd[i].region_offset, + TARGET_PAGE_SIZE, pd[i].phys_offset); + } + } + } else { + void **pp = *lp; + for (i = 0; i < L2_SIZE; ++i) { + phys_page_for_each_1(client, level - 1, pp + i); + } + } +} + +static void phys_page_for_each(CPUPhysMemoryClient *client) +{ + int i; + for (i = 0; i < P_L1_SIZE; ++i) { + phys_page_for_each_1(client, P_L1_SHIFT / L2_BITS - 1, + l1_phys_map + 1); + } +} + +void cpu_register_phys_memory_client(CPUPhysMemoryClient *client) +{ + QLIST_INSERT_HEAD(&memory_client_list, client, list); + phys_page_for_each(client); +} + +void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *client) +{ + QLIST_REMOVE(client, list); +} +#endif + +static int cmp1(const char *s1, int n, const char *s2) +{ + if (strlen(s2) != n) + return 0; + return memcmp(s1, s2, n) == 0; +} + +/* takes a comma separated list of log masks. Return 0 if error. 
*/ +int cpu_str_to_log_mask(const char *str) +{ + const CPULogItem *item; + int mask; + const char *p, *p1; + + p = str; + mask = 0; + for(;;) { + p1 = strchr(p, ','); + if (!p1) + p1 = p + strlen(p); + if(cmp1(p,p1-p,"all")) { + for(item = cpu_log_items; item->mask != 0; item++) { + mask |= item->mask; + } + } else { + for(item = cpu_log_items; item->mask != 0; item++) { + if (cmp1(p, p1 - p, item->name)) + goto found; + } + return 0; + } + found: + mask |= item->mask; + if (*p1 != ',') + break; + p = p1 + 1; + } + return mask; +} + +void cpu_abort(CPUState *env, const char *fmt, ...) +{ + va_list ap; + va_list ap2; + + va_start(ap, fmt); + va_copy(ap2, ap); + fprintf(stderr, "qemu: fatal: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); +#ifdef TARGET_I386 + cpu_dump_state(env, stderr, fprintf, X86_DUMP_FPU | X86_DUMP_CCOP); +#else + cpu_dump_state(env, stderr, fprintf, 0); +#endif + if (qemu_log_enabled()) { + qemu_log("qemu: fatal: "); + qemu_log_vprintf(fmt, ap2); + qemu_log("\n"); +#ifdef TARGET_I386 + log_cpu_state(env, X86_DUMP_FPU | X86_DUMP_CCOP); +#else + log_cpu_state(env, 0); +#endif + qemu_log_flush(); + qemu_log_close(); + } + va_end(ap2); + va_end(ap); +#if defined(CONFIG_USER_ONLY) + { + struct sigaction act; + sigfillset(&act.sa_mask); + act.sa_handler = SIG_DFL; + sigaction(SIGABRT, &act, NULL); + } +#endif + abort(); +} + +CPUState *cpu_copy(CPUState *env) +{ + CPUState *new_env = cpu_init(env->cpu_model_str); + CPUState *next_cpu = new_env->next_cpu; + int cpu_index = new_env->cpu_index; +#if defined(TARGET_HAS_ICE) + CPUBreakpoint *bp; + CPUWatchpoint *wp; +#endif + + memcpy(new_env, env, sizeof(CPUState)); + + /* Preserve chaining and index. */ + new_env->next_cpu = next_cpu; + new_env->cpu_index = cpu_index; + + /* Clone all break/watchpoints. + Note: Once we support ptrace with hw-debug register access, make sure + BP_CPU break/watchpoints are handled correctly on clone. */ + QTAILQ_INIT(&env->breakpoints); + QTAILQ_INIT(&env->watchpoints); +#if defined(TARGET_HAS_ICE) + QTAILQ_FOREACH(bp, &env->breakpoints, entry) { + cpu_breakpoint_insert(new_env, bp->pc, bp->flags, NULL); + } + QTAILQ_FOREACH(wp, &env->watchpoints, entry) { + cpu_watchpoint_insert(new_env, wp->vaddr, (~wp->len_mask) + 1, + wp->flags, NULL); + } +#endif + + return new_env; +} + +#endif /* !VBOX */ +#if !defined(CONFIG_USER_ONLY) + +static inline void tlb_flush_jmp_cache(CPUState *env, target_ulong addr) +{ + unsigned int i; + + /* Discard jump cache entries for any tb which might potentially + overlap the flushed page. 
*/ + i = tb_jmp_cache_hash_page(addr - TARGET_PAGE_SIZE); + memset (&env->tb_jmp_cache[i], 0, + TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); + + i = tb_jmp_cache_hash_page(addr); + memset (&env->tb_jmp_cache[i], 0, + TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); +#ifdef VBOX + + /* inform raw mode about TLB page flush */ + remR3FlushPage(env, addr); +#endif /* VBOX */ +} + +static CPUTLBEntry s_cputlb_empty_entry = { + .addr_read = -1, + .addr_write = -1, + .addr_code = -1, + .addend = -1, +}; + +/* NOTE: if flush_global is true, also flush global entries (not + implemented yet) */ +void tlb_flush(CPUState *env, int flush_global) +{ + int i; + +#ifdef VBOX + Assert(EMRemIsLockOwner(env->pVM)); + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, ~CPU_INTERRUPT_EXTERNAL_FLUSH_TLB); +#endif + +#if defined(DEBUG_TLB) + printf("tlb_flush:\n"); +#endif + /* must reset current TB so that interrupts cannot modify the + links while we are modifying them */ + env->current_tb = NULL; + + for(i = 0; i < CPU_TLB_SIZE; i++) { + int mmu_idx; + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { + env->tlb_table[mmu_idx][i] = s_cputlb_empty_entry; + } + } + + memset (env->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof (void *)); + + env->tlb_flush_addr = -1; + env->tlb_flush_mask = 0; + tlb_flush_count++; +#ifdef VBOX + + /* inform raw mode about TLB flush */ + remR3FlushTLB(env, flush_global); +#endif /* VBOX */ +} + +static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr) +{ + if (addr == (tlb_entry->addr_read & + (TARGET_PAGE_MASK | TLB_INVALID_MASK)) || + addr == (tlb_entry->addr_write & + (TARGET_PAGE_MASK | TLB_INVALID_MASK)) || + addr == (tlb_entry->addr_code & + (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + *tlb_entry = s_cputlb_empty_entry; + } +} + +void tlb_flush_page(CPUState *env, target_ulong addr) +{ + int i; + int mmu_idx; + + Assert(EMRemIsLockOwner(env->pVM)); +#if defined(DEBUG_TLB) + printf("tlb_flush_page: " TARGET_FMT_lx "\n", addr); +#endif + /* Check if we need to flush due to large pages. */ + if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) { +#if defined(DEBUG_TLB) + printf("tlb_flush_page: forced full flush (" + TARGET_FMT_lx "/" TARGET_FMT_lx ")\n", + env->tlb_flush_addr, env->tlb_flush_mask); +#endif + tlb_flush(env, 1); + return; + } + /* must reset current TB so that interrupts cannot modify the + links while we are modifying them */ + env->current_tb = NULL; + + addr &= TARGET_PAGE_MASK; + i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) + tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr); + + tlb_flush_jmp_cache(env, addr); +} + +/* update the TLBs so that writes to code in the virtual page 'addr' + can be detected */ +static void tlb_protect_code(ram_addr_t ram_addr) +{ + cpu_physical_memory_reset_dirty(ram_addr, + ram_addr + TARGET_PAGE_SIZE, + CODE_DIRTY_FLAG); +#if defined(VBOX) && defined(REM_MONITOR_CODE_PAGES) + /** @todo Retest this? This function has changed... 
*/ + remR3ProtectCode(cpu_single_env, ram_addr); +#endif /* VBOX */ +} + +/* update the TLB so that writes in physical page 'phys_addr' are no longer + tested for self modifying code */ +static void tlb_unprotect_code_phys(CPUState *env, ram_addr_t ram_addr, + target_ulong vaddr) +{ + cpu_physical_memory_set_dirty_flags(ram_addr, CODE_DIRTY_FLAG); +} + +static inline void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, + uintptr_t start, uintptr_t length) +{ + uintptr_t addr; +#ifdef VBOX + + if (start & 3) + return; +#endif /* VBOX */ + if ((tlb_entry->addr_write & ~TARGET_PAGE_MASK) == IO_MEM_RAM) { + addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + tlb_entry->addend; + if ((addr - start) < length) { + tlb_entry->addr_write = (tlb_entry->addr_write & TARGET_PAGE_MASK) | TLB_NOTDIRTY; + } + } +} + +/* Note: start and end must be within the same ram block. */ +void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, + int dirty_flags) +{ + CPUState *env; + uintptr_t length, start1; + int i; + + start &= TARGET_PAGE_MASK; + end = TARGET_PAGE_ALIGN(end); + + length = end - start; + if (length == 0) + return; + cpu_physical_memory_mask_dirty_range(start, length, dirty_flags); + + /* we modify the TLB cache so that the dirty bit will be set again + when accessing the range */ +#if defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) + start1 = start; +#elif !defined(VBOX) + start1 = (uintptr_t)qemu_get_ram_ptr(start); + /* Chek that we don't span multiple blocks - this breaks the + address comparisons below. */ + if ((uintptr_t)qemu_get_ram_ptr(end - 1) - start1 + != (end - 1) - start) { + abort(); + } +#else + start1 = (uintptr_t)remR3TlbGCPhys2Ptr(first_cpu, start, 1 /*fWritable*/); /** @todo page replacing (sharing or read only) may cause trouble, fix interface/whatever. 
*/ +#endif + + for(env = first_cpu; env != NULL; env = env->next_cpu) { + int mmu_idx; + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { + for(i = 0; i < CPU_TLB_SIZE; i++) + tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i], + start1, length); + } + } +} + +#ifndef VBOX + +int cpu_physical_memory_set_dirty_tracking(int enable) +{ + int ret = 0; + in_migration = enable; + ret = cpu_notify_migration_log(!!enable); + return ret; +} + +int cpu_physical_memory_get_dirty_tracking(void) +{ + return in_migration; +} + +#endif /* !VBOX */ + +int cpu_physical_sync_dirty_bitmap(target_phys_addr_t start_addr, + target_phys_addr_t end_addr) +{ +#ifndef VBOX + int ret; + + ret = cpu_notify_sync_dirty_bitmap(start_addr, end_addr); + return ret; +#else /* VBOX */ + return 0; +#endif /* VBOX */ +} + +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) +DECLINLINE(void) tlb_update_dirty(CPUTLBEntry *tlb_entry, target_phys_addr_t phys_addend) +#else +static inline void tlb_update_dirty(CPUTLBEntry *tlb_entry) +#endif +{ + ram_addr_t ram_addr; +#ifndef VBOX + void *p; +#endif + + if ((tlb_entry->addr_write & ~TARGET_PAGE_MASK) == IO_MEM_RAM) { +#if defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) + ram_addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + tlb_entry->addend; +#elif !defined(VBOX) + p = (void *)(uintptr_t)((tlb_entry->addr_write & TARGET_PAGE_MASK) + + tlb_entry->addend); + ram_addr = qemu_ram_addr_from_host(p); +#else + Assert(phys_addend != -1); + ram_addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + phys_addend; +#endif + if (!cpu_physical_memory_is_dirty(ram_addr)) { + tlb_entry->addr_write |= TLB_NOTDIRTY; + } + } +} + +/* update the TLB according to the current state of the dirty bits */ +void cpu_tlb_update_dirty(CPUState *env) +{ + int i; + int mmu_idx; + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { + for(i = 0; i < CPU_TLB_SIZE; i++) +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) + tlb_update_dirty(&env->tlb_table[mmu_idx][i], env->phys_addends[mmu_idx][i]); +#else + tlb_update_dirty(&env->tlb_table[mmu_idx][i]); +#endif + } +} + +static inline void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr) +{ + if (tlb_entry->addr_write == (vaddr | TLB_NOTDIRTY)) + tlb_entry->addr_write = vaddr; +} + +/* update the TLB corresponding to virtual page vaddr + so that it is no longer dirty */ +static inline void tlb_set_dirty(CPUState *env, target_ulong vaddr) +{ + int i; + int mmu_idx; + + vaddr &= TARGET_PAGE_MASK; + i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) + tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr); +} + +/* Our TLB does not support large pages, so remember the area covered by + large pages and trigger a full TLB flush if these are invalidated. */ +static void tlb_add_large_page(CPUState *env, target_ulong vaddr, + target_ulong size) +{ + target_ulong mask = ~(size - 1); + + if (env->tlb_flush_addr == (target_ulong)-1) { + env->tlb_flush_addr = vaddr & mask; + env->tlb_flush_mask = mask; + return; + } + /* Extend the existing region to include the new page. + This is a compromise between unnecessary flushes and the cost + of maintaining a full variable size TLB. */ + mask &= env->tlb_flush_mask; + while (((env->tlb_flush_addr ^ vaddr) & mask) != 0) { + mask <<= 1; + } + env->tlb_flush_addr &= mask; + env->tlb_flush_mask = mask; +} + +/* Add a new TLB entry. At most one entry for a given virtual address + is permitted. 
Only a single TARGET_PAGE_SIZE region is mapped, the + supplied size is only used by tlb_flush_page. */ +void tlb_set_page(CPUState *env, target_ulong vaddr, + target_phys_addr_t paddr, int prot, + int mmu_idx, target_ulong size) +{ + PhysPageDesc *p; + ram_addr_t pd; + unsigned int index; + target_ulong address; + target_ulong code_address; + uintptr_t addend; + CPUTLBEntry *te; + CPUWatchpoint *wp; + target_phys_addr_t iotlb; +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) + int read_mods = 0, write_mods = 0, code_mods = 0; +#endif + + assert(size >= TARGET_PAGE_SIZE); + if (size != TARGET_PAGE_SIZE) { + tlb_add_large_page(env, vaddr, size); + } + p = phys_page_find(paddr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } +#if defined(DEBUG_TLB) + printf("tlb_set_page: vaddr=" TARGET_FMT_lx " paddr=0x%08x prot=%x idx=%d size=" TARGET_FMT_lx " pd=0x%08lx\n", + vaddr, (int)paddr, prot, mmu_idx, size, (long)pd); +#endif + + address = vaddr; + if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM && !(pd & IO_MEM_ROMD)) { + /* IO memory case (romd handled later) */ + address |= TLB_MMIO; + } +#if defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) + addend = pd & TARGET_PAGE_MASK; +#elif !defined(VBOX) + addend = (uintptr_t)qemu_get_ram_ptr(pd & TARGET_PAGE_MASK); +#else + /** @todo this is racing the phys_page_find call above since it may register + * a new chunk of memory... */ + addend = (uintptr_t)remR3TlbGCPhys2Ptr(env, pd & TARGET_PAGE_MASK, !!(prot & PAGE_WRITE)); +#endif + + if ((pd & ~TARGET_PAGE_MASK) <= IO_MEM_ROM) { + /* Normal RAM. */ + iotlb = pd & TARGET_PAGE_MASK; + if ((pd & ~TARGET_PAGE_MASK) == IO_MEM_RAM) + iotlb |= IO_MEM_NOTDIRTY; + else + iotlb |= IO_MEM_ROM; + } else { + /* IO handlers are currently passed a physical address. + It would be nice to pass an offset from the base address + of that region. This would avoid having to special case RAM, + and avoid full address decoding in every device. + We can't use the high bits of pd for this because + IO_MEM_ROMD uses these as a ram address. */ + iotlb = (pd & ~TARGET_PAGE_MASK); + if (p) { + iotlb += p->region_offset; + } else { + iotlb += paddr; + } + } + + code_address = address; +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) + + if (addend & 0x3) + { + if (addend & 0x2) + { + /* catch write */ + if ((pd & ~TARGET_PAGE_MASK) <= IO_MEM_ROM) + write_mods |= TLB_MMIO; + } + else if (addend & 0x1) + { + /* catch all */ + if ((pd & ~TARGET_PAGE_MASK) <= IO_MEM_ROM) + { + read_mods |= TLB_MMIO; + write_mods |= TLB_MMIO; + code_mods |= TLB_MMIO; + } + } + if ((iotlb & ~TARGET_PAGE_MASK) == 0) + iotlb = env->pVM->rem.s.iHandlerMemType + paddr; + addend &= ~(target_ulong)0x3; + } + +#endif + /* Make accesses to pages with watchpoints go via the + watchpoint trap routines. */ + QTAILQ_FOREACH(wp, &env->watchpoints, entry) { + if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) { + /* Avoid trapping reads of pages with a write breakpoint. 
*/ + if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) { + iotlb = io_mem_watch + paddr; + address |= TLB_MMIO; + break; + } + } + } + + index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + env->iotlb[mmu_idx][index] = iotlb - vaddr; + te = &env->tlb_table[mmu_idx][index]; + te->addend = addend - vaddr; + if (prot & PAGE_READ) { + te->addr_read = address; + } else { + te->addr_read = -1; + } + + if (prot & PAGE_EXEC) { + te->addr_code = code_address; + } else { + te->addr_code = -1; + } + if (prot & PAGE_WRITE) { + if ((pd & ~TARGET_PAGE_MASK) == IO_MEM_ROM || + (pd & IO_MEM_ROMD)) { + /* Write access calls the I/O callback. */ + te->addr_write = address | TLB_MMIO; + } else if ((pd & ~TARGET_PAGE_MASK) == IO_MEM_RAM && + !cpu_physical_memory_is_dirty(pd)) { + te->addr_write = address | TLB_NOTDIRTY; + } else { + te->addr_write = address; + } + } else { + te->addr_write = -1; + } + +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) + if (prot & PAGE_READ) + te->addr_read |= read_mods; + if (prot & PAGE_EXEC) + te->addr_code |= code_mods; + if (prot & PAGE_WRITE) + te->addr_write |= write_mods; + + env->phys_addends[mmu_idx][index] = (pd & TARGET_PAGE_MASK)- vaddr; +#endif + +#ifdef VBOX + /* inform raw mode about TLB page change */ + remR3FlushPage(env, vaddr); +#endif +} + +#else + +void tlb_flush(CPUState *env, int flush_global) +{ +} + +void tlb_flush_page(CPUState *env, target_ulong addr) +{ +} + +/* + * Walks guest process memory "regions" one by one + * and calls callback function 'fn' for each region. + */ + +struct walk_memory_regions_data +{ + walk_memory_regions_fn fn; + void *priv; + uintptr_t start; + int prot; +}; + +static int walk_memory_regions_end(struct walk_memory_regions_data *data, + abi_ulong end, int new_prot) +{ + if (data->start != -1ul) { + int rc = data->fn(data->priv, data->start, end, data->prot); + if (rc != 0) { + return rc; + } + } + + data->start = (new_prot ? end : -1ul); + data->prot = new_prot; + + return 0; +} + +static int walk_memory_regions_1(struct walk_memory_regions_data *data, + abi_ulong base, int level, void **lp) +{ + abi_ulong pa; + int i, rc; + + if (*lp == NULL) { + return walk_memory_regions_end(data, base, 0); + } + + if (level == 0) { + PageDesc *pd = *lp; + for (i = 0; i < L2_SIZE; ++i) { + int prot = pd[i].flags; + + pa = base | (i << TARGET_PAGE_BITS); + if (prot != data->prot) { + rc = walk_memory_regions_end(data, pa, prot); + if (rc != 0) { + return rc; + } + } + } + } else { + void **pp = *lp; + for (i = 0; i < L2_SIZE; ++i) { + pa = base | ((abi_ulong)i << + (TARGET_PAGE_BITS + L2_BITS * level)); + rc = walk_memory_regions_1(data, pa, level - 1, pp + i); + if (rc != 0) { + return rc; + } + } + } + + return 0; +} + +int walk_memory_regions(void *priv, walk_memory_regions_fn fn) +{ + struct walk_memory_regions_data data; + target_ulong i; + + data.fn = fn; + data.priv = priv; + data.start = -1ul; + data.prot = 0; + + for (i = 0; i < V_L1_SIZE; i++) { + int rc = walk_memory_regions_1(&data, (abi_ulong)i << V_L1_SHIFT, + V_L1_SHIFT / L2_BITS - 1, l1_map + i); + if (rc != 0) { + return rc; + } + } + + return walk_memory_regions_end(&data, 0, 0); +} + +static int dump_region(void *priv, abi_ulong start, + abi_ulong end, unsigned long prot) +{ + FILE *f = (FILE *)priv; + + (void) fprintf(f, TARGET_ABI_FMT_lx"-"TARGET_ABI_FMT_lx + " "TARGET_ABI_FMT_lx" %c%c%c\n", + start, end, end - start, + ((prot & PAGE_READ) ? 'r' : '-'), + ((prot & PAGE_WRITE) ? 'w' : '-'), + ((prot & PAGE_EXEC) ? 
'x' : '-')); + + return (0); +} + +/* dump memory mappings */ +void page_dump(FILE *f) +{ + (void) fprintf(f, "%-8s %-8s %-8s %s\n", + "start", "end", "size", "prot"); + walk_memory_regions(f, dump_region); +} + +int page_get_flags(target_ulong address) +{ + PageDesc *p; + + p = page_find(address >> TARGET_PAGE_BITS); + if (!p) + return 0; + return p->flags; +} + +/* Modify the flags of a page and invalidate the code if necessary. + The flag PAGE_WRITE_ORG is positioned automatically depending + on PAGE_WRITE. The mmap_lock should already be held. */ +void page_set_flags(target_ulong start, target_ulong end, int flags) +{ + target_ulong addr, len; + + /* This function should never be called with addresses outside the + guest address space. If this assert fires, it probably indicates + a missing call to h2g_valid. */ +#if TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS + assert(end < ((abi_ulong)1 << L1_MAP_ADDR_SPACE_BITS)); +#endif + assert(start < end); + + start = start & TARGET_PAGE_MASK; + end = TARGET_PAGE_ALIGN(end); + + if (flags & PAGE_WRITE) { + flags |= PAGE_WRITE_ORG; + } + +#ifdef VBOX + AssertMsgFailed(("We shouldn't be here, and if we should, we must have an env to do the proper locking!\n")); +#endif + for (addr = start, len = end - start; + len != 0; + len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) { + PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1); + + /* If the write protection bit is set, then we invalidate + the code inside. */ + if (!(p->flags & PAGE_WRITE) && + (flags & PAGE_WRITE) && + p->first_tb) { + tb_invalidate_phys_page(addr, 0, NULL); + } + p->flags = flags; + } +} + +int page_check_range(target_ulong start, target_ulong len, int flags) +{ + PageDesc *p; + target_ulong end; + target_ulong addr; + + /* This function should never be called with addresses outside the + guest address space. If this assert fires, it probably indicates + a missing call to h2g_valid. */ +#if TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS + assert(start < ((abi_ulong)1 << L1_MAP_ADDR_SPACE_BITS)); +#endif + + if (len == 0) { + return 0; + } + if (start + len - 1 < start) { + /* We've wrapped around. */ + return -1; + } + + end = TARGET_PAGE_ALIGN(start+len); /* must do before we loose bits in the next step */ + start = start & TARGET_PAGE_MASK; + + for (addr = start, len = end - start; + len != 0; + len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) { + p = page_find(addr >> TARGET_PAGE_BITS); + if( !p ) + return -1; + if( !(p->flags & PAGE_VALID) ) + return -1; + + if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) + return -1; + if (flags & PAGE_WRITE) { + if (!(p->flags & PAGE_WRITE_ORG)) + return -1; + /* unprotect the page if it was put read-only because it + contains translated code */ + if (!(p->flags & PAGE_WRITE)) { + if (!page_unprotect(addr, 0, NULL)) + return -1; + } + return 0; + } + } + return 0; +} + +/* called from signal handler: invalidate the code and unprotect the + page. Return TRUE if the fault was successfully handled. */ +int page_unprotect(target_ulong address, uintptr_t pc, void *puc) +{ + unsigned int prot; + PageDesc *p; + target_ulong host_start, host_end, addr; + + /* Technically this isn't safe inside a signal handler. However we + know this only ever happens in a synchronous SEGV handler, so in + practice it seems to be ok. 
*/ + mmap_lock(); + + p = page_find(address >> TARGET_PAGE_BITS); + if (!p) { + mmap_unlock(); + return 0; + } + + /* if the page was really writable, then we change its + protection back to writable */ + if ((p->flags & PAGE_WRITE_ORG) && !(p->flags & PAGE_WRITE)) { + host_start = address & qemu_host_page_mask; + host_end = host_start + qemu_host_page_size; + + prot = 0; + for (addr = host_start ; addr < host_end ; addr += TARGET_PAGE_SIZE) { + p = page_find(addr >> TARGET_PAGE_BITS); + p->flags |= PAGE_WRITE; + prot |= p->flags; + + /* and since the content will be modified, we must invalidate + the corresponding translated code. */ + tb_invalidate_phys_page(addr, pc, puc); +#ifdef DEBUG_TB_CHECK + tb_invalidate_check(addr); +#endif + } + mprotect((void *)g2h(host_start), qemu_host_page_size, + prot & PAGE_BITS); + + mmap_unlock(); + return 1; + } + mmap_unlock(); + return 0; +} + +static inline void tlb_set_dirty(CPUState *env, + uintptr_t addr, target_ulong vaddr) +{ +} +#endif /* defined(CONFIG_USER_ONLY) */ + +#if !defined(CONFIG_USER_ONLY) + +#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) +typedef struct subpage_t { + target_phys_addr_t base; + ram_addr_t sub_io_index[TARGET_PAGE_SIZE]; + ram_addr_t region_offset[TARGET_PAGE_SIZE]; +} subpage_t; + +static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end, + ram_addr_t memory, ram_addr_t region_offset); +static subpage_t *subpage_init (target_phys_addr_t base, ram_addr_t *phys, + ram_addr_t orig_memory, + ram_addr_t region_offset); +#define CHECK_SUBPAGE(addr, start_addr, start_addr2, end_addr, end_addr2, \ + need_subpage) \ + do { \ + if (addr > start_addr) \ + start_addr2 = 0; \ + else { \ + start_addr2 = start_addr & ~TARGET_PAGE_MASK; \ + if (start_addr2 > 0) \ + need_subpage = 1; \ + } \ + \ + if ((start_addr + orig_size) - addr >= TARGET_PAGE_SIZE) \ + end_addr2 = TARGET_PAGE_SIZE - 1; \ + else { \ + end_addr2 = (start_addr + orig_size - 1) & ~TARGET_PAGE_MASK; \ + if (end_addr2 < TARGET_PAGE_SIZE - 1) \ + need_subpage = 1; \ + } \ + } while (0) + +/* register physical memory. + For RAM, 'size' must be a multiple of the target page size. + If (phys_offset & ~TARGET_PAGE_MASK) != 0, then it is an + io memory page. The address used when calling the IO function is + the offset from the start of the region, plus region_offset. Both + start_addr and region_offset are rounded down to a page boundary + before calculating this offset. This should not be a problem unless + the low bits of start_addr and region_offset differ. 
*/ +void cpu_register_physical_memory_offset(target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset, + ram_addr_t region_offset) +{ + target_phys_addr_t addr, end_addr; + PhysPageDesc *p; + CPUState *env; + ram_addr_t orig_size = size; + subpage_t *subpage; + +#ifndef VBOX + cpu_notify_set_memory(start_addr, size, phys_offset); +#endif /* !VBOX */ + + if (phys_offset == IO_MEM_UNASSIGNED) { + region_offset = start_addr; + } + region_offset &= TARGET_PAGE_MASK; + size = (size + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK; + end_addr = start_addr + (target_phys_addr_t)size; + for(addr = start_addr; addr != end_addr; addr += TARGET_PAGE_SIZE) { + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (p && p->phys_offset != IO_MEM_UNASSIGNED) { + ram_addr_t orig_memory = p->phys_offset; + target_phys_addr_t start_addr2, end_addr2; + int need_subpage = 0; + + CHECK_SUBPAGE(addr, start_addr, start_addr2, end_addr, end_addr2, + need_subpage); + if (need_subpage) { + if (!(orig_memory & IO_MEM_SUBPAGE)) { + subpage = subpage_init((addr & TARGET_PAGE_MASK), + &p->phys_offset, orig_memory, + p->region_offset); + } else { + subpage = io_mem_opaque[(orig_memory & ~TARGET_PAGE_MASK) + >> IO_MEM_SHIFT]; + } + subpage_register(subpage, start_addr2, end_addr2, phys_offset, + region_offset); + p->region_offset = 0; + } else { + p->phys_offset = phys_offset; + if ((phys_offset & ~TARGET_PAGE_MASK) <= IO_MEM_ROM || + (phys_offset & IO_MEM_ROMD)) + phys_offset += TARGET_PAGE_SIZE; + } + } else { + p = phys_page_find_alloc(addr >> TARGET_PAGE_BITS, 1); + p->phys_offset = phys_offset; + p->region_offset = region_offset; + if ((phys_offset & ~TARGET_PAGE_MASK) <= IO_MEM_ROM || + (phys_offset & IO_MEM_ROMD)) { + phys_offset += TARGET_PAGE_SIZE; + } else { + target_phys_addr_t start_addr2, end_addr2; + int need_subpage = 0; + + CHECK_SUBPAGE(addr, start_addr, start_addr2, end_addr, + end_addr2, need_subpage); + + if (need_subpage) { + subpage = subpage_init((addr & TARGET_PAGE_MASK), + &p->phys_offset, IO_MEM_UNASSIGNED, + addr & TARGET_PAGE_MASK); + subpage_register(subpage, start_addr2, end_addr2, + phys_offset, region_offset); + p->region_offset = 0; + } + } + } + region_offset += TARGET_PAGE_SIZE; + } + + /* since each CPU stores ram addresses in its TLB cache, we must + reset the modified entries */ +#ifndef VBOX + /* XXX: slow ! */ + for(env = first_cpu; env != NULL; env = env->next_cpu) { + tlb_flush(env, 1); + } +#else + /* We have one thread per CPU, so, one of the other EMTs might be executing + code right now and flushing the TLB may crash it. 
*/ + env = first_cpu; + if (EMRemIsLockOwner(env->pVM)) + tlb_flush(env, 1); + else + ASMAtomicOrS32((int32_t volatile *)&env->interrupt_request, + CPU_INTERRUPT_EXTERNAL_FLUSH_TLB); +#endif +} + +/* XXX: temporary until new memory mapping API */ +ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr) +{ + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) + return IO_MEM_UNASSIGNED; + return p->phys_offset; +} + +#ifndef VBOX + +void qemu_register_coalesced_mmio(target_phys_addr_t addr, ram_addr_t size) +{ + if (kvm_enabled()) + kvm_coalesce_mmio_region(addr, size); +} + +void qemu_unregister_coalesced_mmio(target_phys_addr_t addr, ram_addr_t size) +{ + if (kvm_enabled()) + kvm_uncoalesce_mmio_region(addr, size); +} + +void qemu_flush_coalesced_mmio_buffer(void) +{ + if (kvm_enabled()) + kvm_flush_coalesced_mmio_buffer(); +} + +#if defined(__linux__) && !defined(TARGET_S390X) + +#include <sys/vfs.h> + +#define HUGETLBFS_MAGIC 0x958458f6 + +static size_t gethugepagesize(const char *path) +{ + struct statfs fs; + int ret; + + do { + ret = statfs(path, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret != 0) { + perror(path); + return 0; + } + + if (fs.f_type != HUGETLBFS_MAGIC) + fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path); + + return (size_t)fs.f_bsize; +} + +static void *file_ram_alloc(RAMBlock *block, + ram_addr_t memory, + const char *path) +{ + char *filename; + void *area; + int fd; +#ifdef MAP_POPULATE + int flags; +#endif + size_t hpagesize; + + hpagesize = gethugepagesize(path); + if (!hpagesize) { + return NULL; + } + + if (memory < hpagesize) { + return NULL; + } + + if (kvm_enabled() && !kvm_has_sync_mmu()) { + fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n"); + return NULL; + } + + if (asprintf(&filename, "%s/qemu_back_mem.XXXXXX", path) == -1) { + return NULL; + } + + fd = mkstemp(filename); + if (fd < 0) { + perror("unable to create backing store for hugepages"); + free(filename); + return NULL; + } + unlink(filename); + free(filename); + + memory = (memory+hpagesize-1) & ~(hpagesize-1); + + /* + * ftruncate is not supported by hugetlbfs in older + * hosts, so don't bother bailing out on errors. + * If anything goes wrong with it under other filesystems, + * mmap will fail. + */ + if (ftruncate(fd, memory)) + perror("ftruncate"); + +#ifdef MAP_POPULATE + /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case + * MAP_PRIVATE is requested. For mem_prealloc we mmap as MAP_SHARED + * to sidestep this quirk. + */ + flags = mem_prealloc ? 
MAP_POPULATE | MAP_SHARED : MAP_PRIVATE; + area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0); +#else + area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); +#endif + if (area == MAP_FAILED) { + perror("file_ram_alloc: can't mmap RAM pages"); + close(fd); + return (NULL); + } + block->fd = fd; + return area; +} +#endif + +static ram_addr_t find_ram_offset(ram_addr_t size) +{ + RAMBlock *block, *next_block; + ram_addr_t offset = 0, mingap = ULONG_MAX; + + if (QLIST_EMPTY(&ram_list.blocks)) + return 0; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + ram_addr_t end, next = ULONG_MAX; + + end = block->offset + block->length; + + QLIST_FOREACH(next_block, &ram_list.blocks, next) { + if (next_block->offset >= end) { + next = MIN(next, next_block->offset); + } + } + if (next - end >= size && next - end < mingap) { + offset = end; + mingap = next - end; + } + } + return offset; +} + +static ram_addr_t last_ram_offset(void) +{ + RAMBlock *block; + ram_addr_t last = 0; + + QLIST_FOREACH(block, &ram_list.blocks, next) + last = MAX(last, block->offset + block->length); + + return last; +} + +ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, + ram_addr_t size, void *host) +{ + RAMBlock *new_block, *block; + + size = TARGET_PAGE_ALIGN(size); + new_block = qemu_mallocz(sizeof(*new_block)); + + if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) { + char *id = dev->parent_bus->info->get_dev_path(dev); + if (id) { + snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id); + qemu_free(id); + } + } + pstrcat(new_block->idstr, sizeof(new_block->idstr), name); + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (!strcmp(block->idstr, new_block->idstr)) { + fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n", + new_block->idstr); + abort(); + } + } + + new_block->host = host; + + new_block->offset = find_ram_offset(size); + new_block->length = size; + + QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next); + + ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty, + last_ram_offset() >> TARGET_PAGE_BITS); + memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS), + 0xff, size >> TARGET_PAGE_BITS); + + if (kvm_enabled()) + kvm_setup_guest_memory(new_block->host, size); + + return new_block->offset; +} + +ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size) +{ + RAMBlock *new_block, *block; + + size = TARGET_PAGE_ALIGN(size); + new_block = qemu_mallocz(sizeof(*new_block)); + + if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) { + char *id = dev->parent_bus->info->get_dev_path(dev); + if (id) { + snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id); + qemu_free(id); + } + } + pstrcat(new_block->idstr, sizeof(new_block->idstr), name); + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (!strcmp(block->idstr, new_block->idstr)) { + fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n", + new_block->idstr); + abort(); + } + } + + if (mem_path) { +#if defined (__linux__) && !defined(TARGET_S390X) + new_block->host = file_ram_alloc(new_block, size, mem_path); + if (!new_block->host) { + new_block->host = qemu_vmalloc(size); +#ifdef MADV_MERGEABLE + madvise(new_block->host, size, MADV_MERGEABLE); +#endif + } +#else + fprintf(stderr, "-mem-path option unsupported\n"); + exit(1); +#endif + } else { +#if defined(TARGET_S390X) && defined(CONFIG_KVM) + /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */ + new_block->host = 
mmap((void*)0x1000000, size, + PROT_EXEC|PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); +#else + new_block->host = qemu_vmalloc(size); +#endif +#ifdef MADV_MERGEABLE + madvise(new_block->host, size, MADV_MERGEABLE); +#endif + } + new_block->offset = find_ram_offset(size); + new_block->length = size; + + QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next); + + ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty, + last_ram_offset() >> TARGET_PAGE_BITS); + memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS), + 0xff, size >> TARGET_PAGE_BITS); + + if (kvm_enabled()) + kvm_setup_guest_memory(new_block->host, size); + + return new_block->offset; +} + +void qemu_ram_free(ram_addr_t addr) +{ + RAMBlock *block; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (addr == block->offset) { + QLIST_REMOVE(block, next); + if (mem_path) { +#if defined (__linux__) && !defined(TARGET_S390X) + if (block->fd) { + munmap(block->host, block->length); + close(block->fd); + } else { + qemu_vfree(block->host); + } +#endif + } else { +#if defined(TARGET_S390X) && defined(CONFIG_KVM) + munmap(block->host, block->length); +#else + qemu_vfree(block->host); +#endif + } + qemu_free(block); + return; + } + } + +} + +/* Return a host pointer to ram allocated with qemu_ram_alloc. + With the exception of the softmmu code in this file, this should + only be used for local memory (e.g. video ram) that the device owns, + and knows it isn't going to access beyond the end of the block. + + It should not be used for general purpose DMA. + Use cpu_physical_memory_map/cpu_physical_memory_rw instead. + */ +void *qemu_get_ram_ptr(ram_addr_t addr) +{ + RAMBlock *block; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (addr - block->offset < block->length) { + QLIST_REMOVE(block, next); + QLIST_INSERT_HEAD(&ram_list.blocks, block, next); + return block->host + (addr - block->offset); + } + } + + fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr); + abort(); + + return NULL; +} + +/* Some of the softmmu routines need to translate from a host pointer + (typically a TLB entry) back to a ram offset. 
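+   The lookup below walks the RAM block list linearly and aborts if the
+   pointer does not fall inside any registered block.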
*/ +ram_addr_t qemu_ram_addr_from_host(void *ptr) +{ + RAMBlock *block; + uint8_t *host = ptr; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (host - block->host < block->length) { + return block->offset + (host - block->host); + } + } + + fprintf(stderr, "Bad ram pointer %p\n", ptr); + abort(); + + return 0; +} + +#endif /* !VBOX */ + +static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem read " TARGET_FMT_plx "\n", addr); +#endif +#if defined(TARGET_SPARC) || defined(TARGET_MICROBLAZE) + do_unassigned_access(addr, 0, 0, 0, 1); +#endif + return 0; +} + +static uint32_t unassigned_mem_readw(void *opaque, target_phys_addr_t addr) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem read " TARGET_FMT_plx "\n", addr); +#endif +#if defined(TARGET_SPARC) || defined(TARGET_MICROBLAZE) + do_unassigned_access(addr, 0, 0, 0, 2); +#endif + return 0; +} + +static uint32_t unassigned_mem_readl(void *opaque, target_phys_addr_t addr) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem read " TARGET_FMT_plx "\n", addr); +#endif +#if defined(TARGET_SPARC) || defined(TARGET_MICROBLAZE) + do_unassigned_access(addr, 0, 0, 0, 4); +#endif + return 0; +} + +static void unassigned_mem_writeb(void *opaque, target_phys_addr_t addr, uint32_t val) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem write " TARGET_FMT_plx " = 0x%x\n", addr, val); +#endif +#if defined(TARGET_SPARC) || defined(TARGET_MICROBLAZE) + do_unassigned_access(addr, 1, 0, 0, 1); +#endif +} + +static void unassigned_mem_writew(void *opaque, target_phys_addr_t addr, uint32_t val) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem write " TARGET_FMT_plx " = 0x%x\n", addr, val); +#endif +#if defined(TARGET_SPARC) || defined(TARGET_MICROBLAZE) + do_unassigned_access(addr, 1, 0, 0, 2); +#endif +} + +static void unassigned_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) +{ +#ifdef DEBUG_UNASSIGNED + printf("Unassigned mem write " TARGET_FMT_plx " = 0x%x\n", addr, val); +#endif +#if defined(TARGET_SPARC) || defined(TARGET_MICROBLAZE) + do_unassigned_access(addr, 1, 0, 0, 4); +#endif +} + +static CPUReadMemoryFunc * const unassigned_mem_read[3] = { + unassigned_mem_readb, + unassigned_mem_readw, + unassigned_mem_readl, +}; + +static CPUWriteMemoryFunc * const unassigned_mem_write[3] = { + unassigned_mem_writeb, + unassigned_mem_writew, + unassigned_mem_writel, +}; + +static void notdirty_mem_writeb(void *opaque, target_phys_addr_t ram_addr, + uint32_t val) +{ + int dirty_flags; + dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); + if (!(dirty_flags & CODE_DIRTY_FLAG)) { +#if !defined(CONFIG_USER_ONLY) + tb_invalidate_phys_page_fast(ram_addr, 1); + dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); +#endif + } +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) + remR3PhysWriteU8(ram_addr, val); +#else + stb_p(qemu_get_ram_ptr(ram_addr), val); +#endif + dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); + cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); + /* we remove the notdirty callback only if the code has been + flushed */ + if (dirty_flags == 0xff) + tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); +} + +static void notdirty_mem_writew(void *opaque, target_phys_addr_t ram_addr, + uint32_t val) +{ + int dirty_flags; + dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); + if (!(dirty_flags & CODE_DIRTY_FLAG)) { +#if !defined(CONFIG_USER_ONLY) + tb_invalidate_phys_page_fast(ram_addr, 2); + dirty_flags = 
cpu_physical_memory_get_dirty_flags(ram_addr); +#endif + } +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) + remR3PhysWriteU16(ram_addr, val); +#else + stw_p(qemu_get_ram_ptr(ram_addr), val); +#endif + dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); + cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); + /* we remove the notdirty callback only if the code has been + flushed */ + if (dirty_flags == 0xff) + tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); +} + +static void notdirty_mem_writel(void *opaque, target_phys_addr_t ram_addr, + uint32_t val) +{ + int dirty_flags; + dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); + if (!(dirty_flags & CODE_DIRTY_FLAG)) { +#if !defined(CONFIG_USER_ONLY) + tb_invalidate_phys_page_fast(ram_addr, 4); + dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr); +#endif + } +#if defined(VBOX) && !defined(REM_PHYS_ADDR_IN_TLB) + remR3PhysWriteU32(ram_addr, val); +#else + stl_p(qemu_get_ram_ptr(ram_addr), val); +#endif + dirty_flags |= (0xff & ~CODE_DIRTY_FLAG); + cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags); + /* we remove the notdirty callback only if the code has been + flushed */ + if (dirty_flags == 0xff) + tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr); +} + +static CPUReadMemoryFunc * const error_mem_read[3] = { + NULL, /* never used */ + NULL, /* never used */ + NULL, /* never used */ +}; + +static CPUWriteMemoryFunc * const notdirty_mem_write[3] = { + notdirty_mem_writeb, + notdirty_mem_writew, + notdirty_mem_writel, +}; + +/* Generate a debug exception if a watchpoint has been hit. */ +static void check_watchpoint(int offset, int len_mask, int flags) +{ + CPUState *env = cpu_single_env; + target_ulong pc, cs_base; + TranslationBlock *tb; + target_ulong vaddr; + CPUWatchpoint *wp; + int cpu_flags; + + if (env->watchpoint_hit) { + /* We re-entered the check after replacing the TB. Now raise + * the debug interrupt so that is will trigger after the + * current instruction. */ + cpu_interrupt(env, CPU_INTERRUPT_DEBUG); + return; + } + vaddr = (env->mem_io_vaddr & TARGET_PAGE_MASK) + offset; + QTAILQ_FOREACH(wp, &env->watchpoints, entry) { + if ((vaddr == (wp->vaddr & len_mask) || + (vaddr & wp->len_mask) == wp->vaddr) && (wp->flags & flags)) { + wp->flags |= BP_WATCHPOINT_HIT; + if (!env->watchpoint_hit) { + env->watchpoint_hit = wp; + tb = tb_find_pc(env->mem_io_pc); + if (!tb) { + cpu_abort(env, "check_watchpoint: could not find TB for " + "pc=%p", (void *)env->mem_io_pc); + } + cpu_restore_state(tb, env, env->mem_io_pc, NULL); + tb_phys_invalidate(tb, -1); + if (wp->flags & BP_STOP_BEFORE_ACCESS) { + env->exception_index = EXCP_DEBUG; + } else { + cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags); + tb_gen_code(env, pc, cs_base, cpu_flags, 1); + } + cpu_resume_from_signal(env, NULL); + } + } else { + wp->flags &= ~BP_WATCHPOINT_HIT; + } + } +} + +/* Watchpoint access routines. Watchpoints are inserted using TLB tricks, + so these check for a hit then pass through to the normal out-of-line + phys routines. 
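+   The len_mask passed to check_watchpoint() encodes the access size:
+   ~0x0 for 8-bit, ~0x1 for 16-bit and ~0x3 for 32-bit accesses, matching
+   the handlers below.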
*/ +static uint32_t watch_mem_readb(void *opaque, target_phys_addr_t addr) +{ + check_watchpoint(addr & ~TARGET_PAGE_MASK, ~0x0, BP_MEM_READ); + return ldub_phys(addr); +} + +static uint32_t watch_mem_readw(void *opaque, target_phys_addr_t addr) +{ + check_watchpoint(addr & ~TARGET_PAGE_MASK, ~0x1, BP_MEM_READ); + return lduw_phys(addr); +} + +static uint32_t watch_mem_readl(void *opaque, target_phys_addr_t addr) +{ + check_watchpoint(addr & ~TARGET_PAGE_MASK, ~0x3, BP_MEM_READ); + return ldl_phys(addr); +} + +static void watch_mem_writeb(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + check_watchpoint(addr & ~TARGET_PAGE_MASK, ~0x0, BP_MEM_WRITE); + stb_phys(addr, val); +} + +static void watch_mem_writew(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + check_watchpoint(addr & ~TARGET_PAGE_MASK, ~0x1, BP_MEM_WRITE); + stw_phys(addr, val); +} + +static void watch_mem_writel(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + check_watchpoint(addr & ~TARGET_PAGE_MASK, ~0x3, BP_MEM_WRITE); + stl_phys(addr, val); +} + +static CPUReadMemoryFunc * const watch_mem_read[3] = { + watch_mem_readb, + watch_mem_readw, + watch_mem_readl, +}; + +static CPUWriteMemoryFunc * const watch_mem_write[3] = { + watch_mem_writeb, + watch_mem_writew, + watch_mem_writel, +}; + +static inline uint32_t subpage_readlen (subpage_t *mmio, + target_phys_addr_t addr, + unsigned int len) +{ + unsigned int idx = SUBPAGE_IDX(addr); +#if defined(DEBUG_SUBPAGE) + printf("%s: subpage %p len %d addr " TARGET_FMT_plx " idx %d\n", __func__, + mmio, len, addr, idx); +#endif + + addr += mmio->region_offset[idx]; + idx = mmio->sub_io_index[idx]; + return io_mem_read[idx][len](io_mem_opaque[idx], addr); +} + +static inline void subpage_writelen (subpage_t *mmio, target_phys_addr_t addr, + uint32_t value, unsigned int len) +{ + unsigned int idx = SUBPAGE_IDX(addr); +#if defined(DEBUG_SUBPAGE) + printf("%s: subpage %p len %d addr " TARGET_FMT_plx " idx %d value %08x\n", + __func__, mmio, len, addr, idx, value); +#endif + + addr += mmio->region_offset[idx]; + idx = mmio->sub_io_index[idx]; + io_mem_write[idx][len](io_mem_opaque[idx], addr, value); +} + +static uint32_t subpage_readb (void *opaque, target_phys_addr_t addr) +{ + return subpage_readlen(opaque, addr, 0); +} + +static void subpage_writeb (void *opaque, target_phys_addr_t addr, + uint32_t value) +{ + subpage_writelen(opaque, addr, value, 0); +} + +static uint32_t subpage_readw (void *opaque, target_phys_addr_t addr) +{ + return subpage_readlen(opaque, addr, 1); +} + +static void subpage_writew (void *opaque, target_phys_addr_t addr, + uint32_t value) +{ + subpage_writelen(opaque, addr, value, 1); +} + +static uint32_t subpage_readl (void *opaque, target_phys_addr_t addr) +{ + return subpage_readlen(opaque, addr, 2); +} + +static void subpage_writel (void *opaque, target_phys_addr_t addr, + uint32_t value) +{ + subpage_writelen(opaque, addr, value, 2); +} + +static CPUReadMemoryFunc * const subpage_read[] = { + &subpage_readb, + &subpage_readw, + &subpage_readl, +}; + +static CPUWriteMemoryFunc * const subpage_write[] = { + &subpage_writeb, + &subpage_writew, + &subpage_writel, +}; + +static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end, + ram_addr_t memory, ram_addr_t region_offset) +{ + int idx, eidx; + + if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE) + return -1; + idx = SUBPAGE_IDX(start); + eidx = SUBPAGE_IDX(end); +#if defined(DEBUG_SUBPAGE) + printf("%s: %p start %08x end %08x idx %08x eidx %08x mem %ld\n", 
__func__, + mmio, start, end, idx, eidx, memory); +#endif + memory = (memory >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + for (; idx <= eidx; idx++) { + mmio->sub_io_index[idx] = memory; + mmio->region_offset[idx] = region_offset; + } + + return 0; +} + +static subpage_t *subpage_init (target_phys_addr_t base, ram_addr_t *phys, + ram_addr_t orig_memory, + ram_addr_t region_offset) +{ + subpage_t *mmio; + int subpage_memory; + + mmio = qemu_mallocz(sizeof(subpage_t)); + + mmio->base = base; + subpage_memory = cpu_register_io_memory(subpage_read, subpage_write, mmio); +#if defined(DEBUG_SUBPAGE) + printf("%s: %p base " TARGET_FMT_plx " len %08x %d\n", __func__, + mmio, base, TARGET_PAGE_SIZE, subpage_memory); +#endif + *phys = subpage_memory | IO_MEM_SUBPAGE; + subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, orig_memory, region_offset); + + return mmio; +} + +static int get_free_io_mem_idx(void) +{ + int i; + + for (i = 0; i<IO_MEM_NB_ENTRIES; i++) + if (!io_mem_used[i]) { + io_mem_used[i] = 1; + return i; + } + fprintf(stderr, "RAN out out io_mem_idx, max %d !\n", IO_MEM_NB_ENTRIES); + return -1; +} + +/* mem_read and mem_write are arrays of functions containing the + function to access byte (index 0), word (index 1) and dword (index + 2). Functions can be omitted with a NULL function pointer. + If io_index is non zero, the corresponding io zone is + modified. If it is zero, a new io zone is allocated. The return + value can be used with cpu_register_physical_memory(). (-1) is + returned if error. */ +static int cpu_register_io_memory_fixed(int io_index, + CPUReadMemoryFunc * const *mem_read, + CPUWriteMemoryFunc * const *mem_write, + void *opaque) +{ + int i; + + if (io_index <= 0) { + io_index = get_free_io_mem_idx(); + if (io_index == -1) + return io_index; + } else { + io_index >>= IO_MEM_SHIFT; + if (io_index >= IO_MEM_NB_ENTRIES) + return -1; + } + + for (i = 0; i < 3; ++i) { + io_mem_read[io_index][i] + = (mem_read[i] ? mem_read[i] : unassigned_mem_read[i]); + } + for (i = 0; i < 3; ++i) { + io_mem_write[io_index][i] + = (mem_write[i] ? 
mem_write[i] : unassigned_mem_write[i]); + } + io_mem_opaque[io_index] = opaque; + + return (io_index << IO_MEM_SHIFT); +} + +int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read, + CPUWriteMemoryFunc * const *mem_write, + void *opaque) +{ + return cpu_register_io_memory_fixed(0, mem_read, mem_write, opaque); +} + +void cpu_unregister_io_memory(int io_table_address) +{ + int i; + int io_index = io_table_address >> IO_MEM_SHIFT; + + for (i=0;i < 3; i++) { + io_mem_read[io_index][i] = unassigned_mem_read[i]; + io_mem_write[io_index][i] = unassigned_mem_write[i]; + } + io_mem_opaque[io_index] = NULL; + io_mem_used[io_index] = 0; +} + +static void io_mem_init(void) +{ + int i; + + cpu_register_io_memory_fixed(IO_MEM_ROM, error_mem_read, unassigned_mem_write, NULL); + cpu_register_io_memory_fixed(IO_MEM_UNASSIGNED, unassigned_mem_read, unassigned_mem_write, NULL); + cpu_register_io_memory_fixed(IO_MEM_NOTDIRTY, error_mem_read, notdirty_mem_write, NULL); + for (i=0; i<5; i++) + io_mem_used[i] = 1; + + io_mem_watch = cpu_register_io_memory(watch_mem_read, + watch_mem_write, NULL); +} + +#endif /* !defined(CONFIG_USER_ONLY) */ + +/* physical memory access (slow version, mainly for debug) */ +#if defined(CONFIG_USER_ONLY) +int cpu_memory_rw_debug(CPUState *env, target_ulong addr, + uint8_t *buf, int len, int is_write) +{ + int l, flags; + target_ulong page; + void * p; + + while (len > 0) { + page = addr & TARGET_PAGE_MASK; + l = (page + TARGET_PAGE_SIZE) - addr; + if (l > len) + l = len; + flags = page_get_flags(page); + if (!(flags & PAGE_VALID)) + return -1; + if (is_write) { + if (!(flags & PAGE_WRITE)) + return -1; + /* XXX: this code should not depend on lock_user */ + if (!(p = lock_user(VERIFY_WRITE, addr, l, 0))) + return -1; + memcpy(p, buf, l); + unlock_user(p, addr, l); + } else { + if (!(flags & PAGE_READ)) + return -1; + /* XXX: this code should not depend on lock_user */ + if (!(p = lock_user(VERIFY_READ, addr, l, 1))) + return -1; + memcpy(buf, p, l); + unlock_user(p, addr, 0); + } + len -= l; + buf += l; + addr += l; + } + return 0; +} + +#else +void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, + int len, int is_write) +{ + int l, io_index; + uint8_t *ptr; + uint32_t val; + target_phys_addr_t page; + ram_addr_t pd; + PhysPageDesc *p; + + while (len > 0) { + page = addr & TARGET_PAGE_MASK; + l = (page + TARGET_PAGE_SIZE) - addr; + if (l > len) + l = len; + p = phys_page_find(page >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if (is_write) { + if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) { + target_phys_addr_t addr1 = addr; + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset; + /* XXX: could force cpu_single_env to NULL to avoid + potential bugs */ + if (l >= 4 && ((addr1 & 3) == 0)) { + /* 32 bit write access */ +#if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) + val = ldl_p(buf); +#else + val = *(const uint32_t *)buf; +#endif + io_mem_write[io_index][2](io_mem_opaque[io_index], addr1, val); + l = 4; + } else if (l >= 2 && ((addr1 & 1) == 0)) { + /* 16 bit write access */ +#if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) + val = lduw_p(buf); +#else + val = *(const uint16_t *)buf; +#endif + io_mem_write[io_index][1](io_mem_opaque[io_index], addr1, val); + l = 2; + } else { + /* 8 bit write access */ +#if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) + val = ldub_p(buf); +#else + val = *(const uint8_t *)buf; +#endif + 
io_mem_write[io_index][0](io_mem_opaque[io_index], addr1, val); + l = 1; + } + } else { + ram_addr_t addr1; + addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK); + /* RAM case */ +#ifdef VBOX + remR3PhysWrite(addr1, buf, l); NOREF(ptr); +#else + ptr = qemu_get_ram_ptr(addr1); + memcpy(ptr, buf, l); +#endif + if (!cpu_physical_memory_is_dirty(addr1)) { + /* invalidate code */ + tb_invalidate_phys_page_range(addr1, addr1 + l, 0); + /* set dirty bit */ + cpu_physical_memory_set_dirty_flags( + addr1, (0xff & ~CODE_DIRTY_FLAG)); + } + } + } else { + if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM && + !(pd & IO_MEM_ROMD)) { + target_phys_addr_t addr1 = addr; + /* I/O case */ + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset; + if (l >= 4 && ((addr1 & 3) == 0)) { + /* 32 bit read access */ + val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr1); +#if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) + stl_p(buf, val); +#else + *(uint32_t *)buf = val; +#endif + l = 4; + } else if (l >= 2 && ((addr1 & 1) == 0)) { + /* 16 bit read access */ + val = io_mem_read[io_index][1](io_mem_opaque[io_index], addr1); +#if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) + stw_p(buf, val); +#else + *(uint16_t *)buf = val; +#endif + l = 2; + } else { + /* 8 bit read access */ + val = io_mem_read[io_index][0](io_mem_opaque[io_index], addr1); +#if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) + stb_p(buf, val); +#else + *(uint8_t *)buf = val; +#endif + l = 1; + } + } else { + /* RAM case */ +#ifdef VBOX + remR3PhysRead((pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK), buf, l); NOREF(ptr); +#else + ptr = qemu_get_ram_ptr(pd & TARGET_PAGE_MASK) + + (addr & ~TARGET_PAGE_MASK); + memcpy(buf, ptr, l); +#endif + } + } + len -= l; + buf += l; + addr += l; + } +} + +#ifndef VBOX + +/* used for ROM loading : can write in RAM and ROM */ +void cpu_physical_memory_write_rom(target_phys_addr_t addr, + const uint8_t *buf, int len) +{ + int l; + uint8_t *ptr; + target_phys_addr_t page; + ram_addr_t pd; + PhysPageDesc *p; + + while (len > 0) { + page = addr & TARGET_PAGE_MASK; + l = (page + TARGET_PAGE_SIZE) - addr; + if (l > len) + l = len; + p = phys_page_find(page >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM && + (pd & ~TARGET_PAGE_MASK) != IO_MEM_ROM && + !(pd & IO_MEM_ROMD)) { + /* do nothing */ + } else { + ram_addr_t addr1; + addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK); + /* ROM/RAM case */ + ptr = qemu_get_ram_ptr(addr1); + memcpy(ptr, buf, l); + } + len -= l; + buf += l; + addr += l; + } +} + +typedef struct { + void *buffer; + target_phys_addr_t addr; + target_phys_addr_t len; +} BounceBuffer; + +static BounceBuffer bounce; + +typedef struct MapClient { + void *opaque; + void (*callback)(void *opaque); + QLIST_ENTRY(MapClient) link; +} MapClient; + +static QLIST_HEAD(map_client_list, MapClient) map_client_list + = QLIST_HEAD_INITIALIZER(map_client_list); + +void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque)) +{ + MapClient *client = qemu_malloc(sizeof(*client)); + + client->opaque = opaque; + client->callback = callback; + QLIST_INSERT_HEAD(&map_client_list, client, link); + return client; +} + +void cpu_unregister_map_client(void *_client) +{ + MapClient *client = (MapClient *)_client; + + QLIST_REMOVE(client, link); + qemu_free(client); +} + +static void cpu_notify_map_clients(void) +{ 
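+    /* Drain the list of registered map clients: each callback is invoked once
+       and the client is then unregistered, so a callback will typically retry
+       the cpu_physical_memory_map() call that previously failed. */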
+ MapClient *client; + + while (!QLIST_EMPTY(&map_client_list)) { + client = QLIST_FIRST(&map_client_list); + client->callback(client->opaque); + cpu_unregister_map_client(client); + } +} + +/* Map a physical memory region into a host virtual address. + * May map a subset of the requested range, given by and returned in *plen. + * May return NULL if resources needed to perform the mapping are exhausted. + * Use only for reads OR writes - not for read-modify-write operations. + * Use cpu_register_map_client() to know when retrying the map operation is + * likely to succeed. + */ +void *cpu_physical_memory_map(target_phys_addr_t addr, + target_phys_addr_t *plen, + int is_write) +{ + target_phys_addr_t len = *plen; + target_phys_addr_t done = 0; + int l; + uint8_t *ret = NULL; + uint8_t *ptr; + target_phys_addr_t page; + ram_addr_t pd; + PhysPageDesc *p; + ram_addr_t addr1; + + while (len > 0) { + page = addr & TARGET_PAGE_MASK; + l = (page + TARGET_PAGE_SIZE) - addr; + if (l > len) + l = len; + p = phys_page_find(page >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) { + if (done || bounce.buffer) { + break; + } + bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE); + bounce.addr = addr; + bounce.len = l; + if (!is_write) { + cpu_physical_memory_rw(addr, bounce.buffer, l, 0); + } + ptr = bounce.buffer; + } else { + addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK); + ptr = qemu_get_ram_ptr(addr1); + } + if (!done) { + ret = ptr; + } else if (ret + done != ptr) { + break; + } + + len -= l; + addr += l; + done += l; + } + *plen = done; + return ret; +} + +/* Unmaps a memory region previously mapped by cpu_physical_memory_map(). + * Will also mark the memory as dirty if is_write == 1. access_len gives + * the amount of memory that was actually read or written by the caller. 
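+ * For bounce-buffer mappings, a write mapping is flushed back to guest
+ * memory with cpu_physical_memory_write() before the buffer is freed and
+ * any waiting map clients are notified.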
+ */ +void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len, + int is_write, target_phys_addr_t access_len) +{ + if (buffer != bounce.buffer) { + if (is_write) { + ram_addr_t addr1 = qemu_ram_addr_from_host(buffer); + while (access_len) { + unsigned l; + l = TARGET_PAGE_SIZE; + if (l > access_len) + l = access_len; + if (!cpu_physical_memory_is_dirty(addr1)) { + /* invalidate code */ + tb_invalidate_phys_page_range(addr1, addr1 + l, 0); + /* set dirty bit */ + cpu_physical_memory_set_dirty_flags( + addr1, (0xff & ~CODE_DIRTY_FLAG)); + } + addr1 += l; + access_len -= l; + } + } + return; + } + if (is_write) { + cpu_physical_memory_write(bounce.addr, bounce.buffer, access_len); + } + qemu_vfree(bounce.buffer); + bounce.buffer = NULL; + cpu_notify_map_clients(); +} + +#endif /* !VBOX */ + +/* warning: addr must be aligned */ +uint32_t ldl_phys(target_phys_addr_t addr) +{ + int io_index; + uint8_t *ptr; + uint32_t val; + ram_addr_t pd; + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM && + !(pd & IO_MEM_ROMD)) { + /* I/O case */ + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr = (addr & ~TARGET_PAGE_MASK) + p->region_offset; + val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr); + } else { + /* RAM case */ +#ifndef VBOX + ptr = qemu_get_ram_ptr(pd & TARGET_PAGE_MASK) + + (addr & ~TARGET_PAGE_MASK); + val = ldl_p(ptr); +#else + val = remR3PhysReadU32((pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK)); NOREF(ptr); +#endif + } + return val; +} + +/* warning: addr must be aligned */ +uint64_t ldq_phys(target_phys_addr_t addr) +{ + int io_index; + uint8_t *ptr; + uint64_t val; + ram_addr_t pd; + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM && + !(pd & IO_MEM_ROMD)) { + /* I/O case */ + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr = (addr & ~TARGET_PAGE_MASK) + p->region_offset; +#ifdef TARGET_WORDS_BIGENDIAN + val = (uint64_t)io_mem_read[io_index][2](io_mem_opaque[io_index], addr) << 32; + val |= io_mem_read[io_index][2](io_mem_opaque[io_index], addr + 4); +#else + val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr); + val |= (uint64_t)io_mem_read[io_index][2](io_mem_opaque[io_index], addr + 4) << 32; +#endif + } else { + /* RAM case */ +#ifndef VBOX + ptr = qemu_get_ram_ptr(pd & TARGET_PAGE_MASK) + + (addr & ~TARGET_PAGE_MASK); + val = ldq_p(ptr); +#else + val = remR3PhysReadU64((pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK)); NOREF(ptr); +#endif + } + return val; +} + +/* XXX: optimize */ +uint32_t ldub_phys(target_phys_addr_t addr) +{ + uint8_t val; + cpu_physical_memory_read(addr, &val, 1); + return val; +} + +/* warning: addr must be aligned */ +uint32_t lduw_phys(target_phys_addr_t addr) +{ + int io_index; +#ifndef VBOX + uint8_t *ptr; +#endif + uint64_t val; + ram_addr_t pd; + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM && + !(pd & IO_MEM_ROMD)) { + /* I/O case */ + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr = (addr & ~TARGET_PAGE_MASK) + p->region_offset; + val = io_mem_read[io_index][1](io_mem_opaque[io_index], addr); + } else { + /* RAM 
case */ +#ifndef VBOX + ptr = qemu_get_ram_ptr(pd & TARGET_PAGE_MASK) + + (addr & ~TARGET_PAGE_MASK); + val = lduw_p(ptr); +#else + val = remR3PhysReadU16((pd & TARGET_PAGE_MASK) | (addr & ~TARGET_PAGE_MASK)); +#endif + } + return val; +} + +/* warning: addr must be aligned. The ram page is not masked as dirty + and the code inside is not invalidated. It is useful if the dirty + bits are used to track modified PTEs */ +void stl_phys_notdirty(target_phys_addr_t addr, uint32_t val) +{ + int io_index; + uint8_t *ptr; + ram_addr_t pd; + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) { + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr = (addr & ~TARGET_PAGE_MASK) + p->region_offset; + io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val); + } else { +#ifndef VBOX + ram_addr_t addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK); + ptr = qemu_get_ram_ptr(addr1); + stl_p(ptr, val); +#else + remR3PhysWriteU32((pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK), val); NOREF(ptr); +#endif + +#ifndef VBOX + if (unlikely(in_migration)) { + if (!cpu_physical_memory_is_dirty(addr1)) { + /* invalidate code */ + tb_invalidate_phys_page_range(addr1, addr1 + 4, 0); + /* set dirty bit */ + cpu_physical_memory_set_dirty_flags( + addr1, (0xff & ~CODE_DIRTY_FLAG)); + } + } +#endif /* !VBOX */ + } +} + +void stq_phys_notdirty(target_phys_addr_t addr, uint64_t val) +{ + int io_index; + uint8_t *ptr; + ram_addr_t pd; + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) { + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr = (addr & ~TARGET_PAGE_MASK) + p->region_offset; +#ifdef TARGET_WORDS_BIGENDIAN + io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val >> 32); + io_mem_write[io_index][2](io_mem_opaque[io_index], addr + 4, val); +#else + io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val); + io_mem_write[io_index][2](io_mem_opaque[io_index], addr + 4, val >> 32); +#endif + } else { +#ifndef VBOX + ptr = qemu_get_ram_ptr(pd & TARGET_PAGE_MASK) + + (addr & ~TARGET_PAGE_MASK); + stq_p(ptr, val); +#else + remR3PhysWriteU64((pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK), val); NOREF(ptr); +#endif + } +} + +/* warning: addr must be aligned */ +void stl_phys(target_phys_addr_t addr, uint32_t val) +{ + int io_index; + uint8_t *ptr; + ram_addr_t pd; + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) { + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr = (addr & ~TARGET_PAGE_MASK) + p->region_offset; + io_mem_write[io_index][2](io_mem_opaque[io_index], addr, val); + } else { + ram_addr_t addr1; + addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK); + /* RAM case */ +#ifndef VBOX + ptr = qemu_get_ram_ptr(addr1); + stl_p(ptr, val); +#else + remR3PhysWriteU32((pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK), val); NOREF(ptr); +#endif + if (!cpu_physical_memory_is_dirty(addr1)) { + /* invalidate code */ + tb_invalidate_phys_page_range(addr1, addr1 + 4, 0); + /* set dirty bit */ + cpu_physical_memory_set_dirty_flags(addr1, + (0xff & ~CODE_DIRTY_FLAG)); + } + } +} + +/* XXX: 
optimize */ +void stb_phys(target_phys_addr_t addr, uint32_t val) +{ + uint8_t v = val; + cpu_physical_memory_write(addr, &v, 1); +} + +/* warning: addr must be aligned */ +void stw_phys(target_phys_addr_t addr, uint32_t val) +{ + int io_index; + uint8_t *ptr; + ram_addr_t pd; + PhysPageDesc *p; + + p = phys_page_find(addr >> TARGET_PAGE_BITS); + if (!p) { + pd = IO_MEM_UNASSIGNED; + } else { + pd = p->phys_offset; + } + + if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) { + io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + if (p) + addr = (addr & ~TARGET_PAGE_MASK) + p->region_offset; + io_mem_write[io_index][1](io_mem_opaque[io_index], addr, val); + } else { + ram_addr_t addr1; + addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK); + /* RAM case */ +#ifndef VBOX + ptr = qemu_get_ram_ptr(addr1); + stw_p(ptr, val); +#else + remR3PhysWriteU16(addr1, val); NOREF(ptr); +#endif + if (!cpu_physical_memory_is_dirty(addr1)) { + /* invalidate code */ + tb_invalidate_phys_page_range(addr1, addr1 + 2, 0); + /* set dirty bit */ + cpu_physical_memory_set_dirty_flags(addr1, + (0xff & ~CODE_DIRTY_FLAG)); + } + } +} + +/* XXX: optimize */ +void stq_phys(target_phys_addr_t addr, uint64_t val) +{ + val = tswap64(val); + cpu_physical_memory_write(addr, (const uint8_t *)&val, 8); +} + +#ifndef VBOX +/* virtual memory access for debug (includes writing to ROM) */ +int cpu_memory_rw_debug(CPUState *env, target_ulong addr, + uint8_t *buf, int len, int is_write) +{ + int l; + target_phys_addr_t phys_addr; + target_ulong page; + + while (len > 0) { + page = addr & TARGET_PAGE_MASK; + phys_addr = cpu_get_phys_page_debug(env, page); + /* if no physical page mapped, return an error */ + if (phys_addr == -1) + return -1; + l = (page + TARGET_PAGE_SIZE) - addr; + if (l > len) + l = len; + phys_addr += (addr & ~TARGET_PAGE_MASK); + if (is_write) + cpu_physical_memory_write_rom(phys_addr, buf, l); + else + cpu_physical_memory_rw(phys_addr, buf, l, is_write); + len -= l; + buf += l; + addr += l; + } + return 0; +} +#endif /* !VBOX */ +#endif + +/* in deterministic execution mode, instructions doing device I/Os + must be at the end of the TB */ +void cpu_io_recompile(CPUState *env, void *retaddr) +{ + TranslationBlock *tb; + uint32_t n, cflags; + target_ulong pc, cs_base; + uint64_t flags; + + tb = tb_find_pc((uintptr_t)retaddr); + if (!tb) { + cpu_abort(env, "cpu_io_recompile: could not find TB for pc=%p", + retaddr); + } + n = env->icount_decr.u16.low + tb->icount; + cpu_restore_state(tb, env, (uintptr_t)retaddr, NULL); + /* Calculate how many instructions had been executed before the fault + occurred. */ + n = n - env->icount_decr.u16.low; + /* Generate a new TB ending on the I/O insn. */ + n++; + /* On MIPS and SH, delay slot instructions can only be restarted if + they were already the first instruction in the TB. If this is not + the first instruction in a TB then re-execute the preceding + branch. */ +#if defined(TARGET_MIPS) + if ((env->hflags & MIPS_HFLAG_BMASK) != 0 && n > 1) { + env->active_tc.PC -= 4; + env->icount_decr.u16.low++; + env->hflags &= ~MIPS_HFLAG_BMASK; + } +#elif defined(TARGET_SH4) + if ((env->flags & ((DELAY_SLOT | DELAY_SLOT_CONDITIONAL))) != 0 + && n > 1) { + env->pc -= 2; + env->icount_decr.u16.low++; + env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL); + } +#endif + /* This should never happen. 
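+       (n would not fit into the CF_COUNT_MASK bits of the cflags value
+       built below.)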
*/ + if (n > CF_COUNT_MASK) + cpu_abort(env, "TB too big during recompile"); + + cflags = n | CF_LAST_IO; + pc = tb->pc; + cs_base = tb->cs_base; + flags = tb->flags; + tb_phys_invalidate(tb, -1); + /* FIXME: In theory this could raise an exception. In practice + we have already translated the block once so it's probably ok. */ + tb_gen_code(env, pc, cs_base, flags, cflags); + /** @todo If env->pc != tb->pc (i.e. the faulting instruction was not + the first in the TB) then we end up generating a whole new TB and + repeating the fault, which is horribly inefficient. + Better would be to execute just this insn uncached, or generate a + second new TB. */ + cpu_resume_from_signal(env, NULL); +} + +#if !defined(CONFIG_USER_ONLY) + +#ifndef VBOX +void dump_exec_info(FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...)) +{ + int i, target_code_size, max_target_code_size; + int direct_jmp_count, direct_jmp2_count, cross_page; + TranslationBlock *tb; + + target_code_size = 0; + max_target_code_size = 0; + cross_page = 0; + direct_jmp_count = 0; + direct_jmp2_count = 0; + for(i = 0; i < nb_tbs; i++) { + tb = &tbs[i]; + target_code_size += tb->size; + if (tb->size > max_target_code_size) + max_target_code_size = tb->size; + if (tb->page_addr[1] != -1) + cross_page++; + if (tb->tb_next_offset[0] != 0xffff) { + direct_jmp_count++; + if (tb->tb_next_offset[1] != 0xffff) { + direct_jmp2_count++; + } + } + } + /* XXX: avoid using doubles ? */ + cpu_fprintf(f, "Translation buffer state:\n"); + cpu_fprintf(f, "gen code size %ld/%ld\n", + code_gen_ptr - code_gen_buffer, code_gen_buffer_max_size); + cpu_fprintf(f, "TB count %d/%d\n", + nb_tbs, code_gen_max_blocks); + cpu_fprintf(f, "TB avg target size %d max=%d bytes\n", + nb_tbs ? target_code_size / nb_tbs : 0, + max_target_code_size); + cpu_fprintf(f, "TB avg host size %d bytes (expansion ratio: %0.1f)\n", + nb_tbs ? (code_gen_ptr - code_gen_buffer) / nb_tbs : 0, + target_code_size ? (double) (code_gen_ptr - code_gen_buffer) / target_code_size : 0); + cpu_fprintf(f, "cross page TB count %d (%d%%)\n", + cross_page, + nb_tbs ? (cross_page * 100) / nb_tbs : 0); + cpu_fprintf(f, "direct jump count %d (%d%%) (2 jumps=%d %d%%)\n", + direct_jmp_count, + nb_tbs ? (direct_jmp_count * 100) / nb_tbs : 0, + direct_jmp2_count, + nb_tbs ? (direct_jmp2_count * 100) / nb_tbs : 0); + cpu_fprintf(f, "\nStatistics:\n"); + cpu_fprintf(f, "TB flush count %d\n", tb_flush_count); + cpu_fprintf(f, "TB invalidate count %d\n", tb_phys_invalidate_count); + cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count); + tcg_dump_info(f, cpu_fprintf); +} +#endif /* !VBOX */ + +#define MMUSUFFIX _cmmu +#define GETPC() NULL +#define env cpu_single_env +#define SOFTMMU_CODE_ACCESS + +#define SHIFT 0 +#include "softmmu_template.h" + +#define SHIFT 1 +#include "softmmu_template.h" + +#define SHIFT 2 +#include "softmmu_template.h" + +#define SHIFT 3 +#include "softmmu_template.h" + +#undef env + +#endif diff --git a/src/recompiler/fpu/Makefile.kup b/src/recompiler/fpu/Makefile.kup new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/recompiler/fpu/Makefile.kup diff --git a/src/recompiler/fpu/softfloat-macros.h b/src/recompiler/fpu/softfloat-macros.h new file mode 100644 index 00000000..78382282 --- /dev/null +++ b/src/recompiler/fpu/softfloat-macros.h @@ -0,0 +1,719 @@ + +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. 
+ +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 32, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. +*----------------------------------------------------------------------------*/ + +INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr ) +{ + bits32 z; + + if ( count == 0 ) { + z = a; + } + else if ( count < 32 ) { + z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); + } + else { + z = ( a != 0 ); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 64, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. 
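+| For example, shift64RightJamming( 0x8000000000000001, 1, &z ) stores
+| 0x4000000000000001 in `z': the nonzero bit shifted off is jammed into the
+| least significant bit of the result.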
+*----------------------------------------------------------------------------*/ + +INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr ) +{ + bits64 z; + + if ( count == 0 ) { + z = a; + } + else if ( count < 64 ) { + z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); + } + else { + z = ( a != 0 ); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 +| _plus_ the number of bits given in `count'. The shifted result is at most +| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The +| bits shifted off form a second 64-bit result as follows: The _last_ bit +| shifted off is the most-significant bit of the extra result, and the other +| 63 bits of the extra result are all zero if and only if _all_but_the_last_ +| bits shifted off were all zero. This extra result is stored in the location +| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. +| (This routine makes more sense if `a0' and `a1' are considered to form +| a fixed-point value with binary point between `a0' and `a1'. This fixed- +| point value is shifted right by the number of bits given in `count', and +| the integer part of the result is returned at the location pointed to by +| `z0Ptr'. The fractional part of the result may be slightly corrupted as +| described above, and is returned at the location pointed to by `z1Ptr'.) +*----------------------------------------------------------------------------*/ + +INLINE void + shift64ExtraRightJamming( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<<negCount ) | ( a1 != 0 ); + z0 = a0>>count; + } + else { + if ( count == 64 ) { + z1 = a0 | ( a1 != 0 ); + } + else { + z1 = ( ( a0 | a1 ) != 0 ); + } + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' can be arbitrarily large; in particular, if `count' is greater +| than 128, the result will be 0. The result is broken into two 64-bit pieces +| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift128Right( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<<negCount ) | ( a1>>count ); + z0 = a0>>count; + } + else { + z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0; + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. If any nonzero bits are shifted off, they +| are ``jammed'' into the least significant bit of the result by setting the +| least significant bit to 1. The value of `count' can be arbitrarily large; +| in particular, if `count' is greater than 128, the result will be either +| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or +| nonzero. 
The result is broken into two 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift128RightJamming( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 ); + z0 = a0>>count; + } + else { + if ( count == 64 ) { + z1 = a0 | ( a1 != 0 ); + } + else if ( count < 128 ) { + z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 ); + } + else { + z1 = ( ( a0 | a1 ) != 0 ); + } + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right +| by 64 _plus_ the number of bits given in `count'. The shifted result is +| at most 128 nonzero bits; these are broken into two 64-bit pieces which are +| stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted +| off form a third 64-bit result as follows: The _last_ bit shifted off is +| the most-significant bit of the extra result, and the other 63 bits of the +| extra result are all zero if and only if _all_but_the_last_ bits shifted off +| were all zero. This extra result is stored in the location pointed to by +| `z2Ptr'. The value of `count' can be arbitrarily large. +| (This routine makes more sense if `a0', `a1', and `a2' are considered +| to form a fixed-point value with binary point between `a1' and `a2'. This +| fixed-point value is shifted right by the number of bits given in `count', +| and the integer part of the result is returned at the locations pointed to +| by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly +| corrupted as described above, and is returned at the location pointed to by +| `z2Ptr'.) +*----------------------------------------------------------------------------*/ + +INLINE void + shift128ExtraRightJamming( + bits64 a0, + bits64 a1, + bits64 a2, + int16 count, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z2 = a2; + z1 = a1; + z0 = a0; + } + else { + if ( count < 64 ) { + z2 = a1<<negCount; + z1 = ( a0<<negCount ) | ( a1>>count ); + z0 = a0>>count; + } + else { + if ( count == 64 ) { + z2 = a1; + z1 = a0; + } + else { + a2 |= a1; + if ( count < 128 ) { + z2 = a0<<negCount; + z1 = a0>>( count & 63 ); + } + else { + z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); + z1 = 0; + } + } + z0 = 0; + } + z2 |= ( a2 != 0 ); + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' must be less than 64. The result is broken into two 64-bit +| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift128Left( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + + *z1Ptr = a1<<count; + *z0Ptr = + ( count == 0 ) ? 
a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) ); + +} + +/*---------------------------------------------------------------------------- +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left +| by the number of bits given in `count'. Any bits shifted off are lost. +| The value of `count' must be less than 64. The result is broken into three +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift192Left( + bits64 a0, + bits64 a1, + bits64 a2, + int16 count, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2; + int8 negCount; + + z2 = a2<<count; + z1 = a1<<count; + z0 = a0<<count; + if ( 0 < count ) { + negCount = ( ( - count ) & 63 ); + z1 |= a2>>negCount; + z0 |= a1>>negCount; + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so +| any carry out is lost. The result is broken into two 64-bit pieces which +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add128( + bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z1; + + z1 = a1 + b1; + *z1Ptr = z1; + *z0Ptr = a0 + b0 + ( z1 < a1 ); + +} + +/*---------------------------------------------------------------------------- +| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the +| 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is +| modulo 2^192, so any carry out is lost. The result is broken into three +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add192( + bits64 a0, + bits64 a1, + bits64 a2, + bits64 b0, + bits64 b1, + bits64 b2, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2; + int8 carry0, carry1; + + z2 = a2 + b2; + carry1 = ( z2 < a2 ); + z1 = a1 + b1; + carry0 = ( z1 < a1 ); + z0 = a0 + b0; + z1 += carry1; + z0 += ( z1 < carry1 ); + z0 += carry0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the +| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo +| 2^128, so any borrow out (carry out) is lost. The result is broken into two +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and +| `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub128( + bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + + *z1Ptr = a1 - b1; + *z0Ptr = a0 - b0 - ( a1 < b1 ); + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' +| from the 192-bit value formed by concatenating `a0', `a1', and `a2'. +| Subtraction is modulo 2^192, so any borrow out (carry out) is lost. 
The +| result is broken into three 64-bit pieces which are stored at the locations +| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub192( + bits64 a0, + bits64 a1, + bits64 a2, + bits64 b0, + bits64 b1, + bits64 b2, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2; + int8 borrow0, borrow1; + + z2 = a2 - b2; + borrow1 = ( a2 < b2 ); + z1 = a1 - b1; + borrow0 = ( a1 < b1 ); + z0 = a0 - b0; + z0 -= ( z1 < borrow1 ); + z1 -= borrow1; + z0 -= borrow0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken +| into two 64-bit pieces which are stored at the locations pointed to by +| `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits32 aHigh, aLow, bHigh, bLow; + bits64 z0, zMiddleA, zMiddleB, z1; + + aLow = a; + aHigh = a>>32; + bLow = b; + bHigh = b>>32; + z1 = ( (bits64) aLow ) * bLow; + zMiddleA = ( (bits64) aLow ) * bHigh; + zMiddleB = ( (bits64) aHigh ) * bLow; + z0 = ( (bits64) aHigh ) * bHigh; + zMiddleA += zMiddleB; + z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); + zMiddleA <<= 32; + z1 += zMiddleA; + z0 += ( z1 < zMiddleA ); + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by +| `b' to obtain a 192-bit product. The product is broken into three 64-bit +| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and +| `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul128By64To192( + bits64 a0, + bits64 a1, + bits64 b, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2, more1; + + mul64To128( a1, b, &z1, &z2 ); + mul64To128( a0, b, &z0, &more1 ); + add128( z0, more1, 0, z1, &z0, &z1 ); + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the +| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit +| product. The product is broken into four 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul128To256( + bits64 a0, + bits64 a1, + bits64 b0, + bits64 b1, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr, + bits64 *z3Ptr + ) +{ + bits64 z0, z1, z2, z3; + bits64 more1, more2; + + mul64To128( a1, b1, &z2, &z3 ); + mul64To128( a1, b0, &z1, &more2 ); + add128( z1, more2, 0, z2, &z1, &z2 ); + mul64To128( a0, b0, &z0, &more1 ); + add128( z0, more1, 0, z1, &z0, &z1 ); + mul64To128( a0, b1, &more1, &more2 ); + add128( more1, more2, 0, z2, &more1, &z2 ); + add128( z0, z1, 0, more1, &z0, &z1 ); + *z3Ptr = z3; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the 64-bit integer quotient obtained by dividing +| `b' into the 128-bit value formed by concatenating `a0' and `a1'. 
The +| divisor `b' must be at least 2^63. If q is the exact quotient truncated +| toward zero, the approximation returned lies between q and q + 2 inclusive. +| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit +| unsigned integer is returned. +*----------------------------------------------------------------------------*/ + +static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b ) +{ + bits64 b0, b1; + bits64 rem0, rem1, term0, term1; + bits64 z; + + if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF ); + b0 = b>>32; + z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32; + mul64To128( b, z, &term0, &term1 ); + sub128( a0, a1, term0, term1, &rem0, &rem1 ); + while ( ( (sbits64) rem0 ) < 0 ) { + z -= LIT64( 0x100000000 ); + b1 = b<<32; + add128( rem0, rem1, b0, b1, &rem0, &rem1 ); + } + rem0 = ( rem0<<32 ) | ( rem1>>32 ); + z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the square root of the 32-bit significand given +| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of +| `aExp' (the least significant bit) is 1, the integer returned approximates +| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' +| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either +| case, the approximation returned lies strictly within +/-2 of the exact +| value. +*----------------------------------------------------------------------------*/ + +static bits32 estimateSqrt32( int16 aExp, bits32 a ) +{ + static const bits16 sqrtOddAdjustments[] = { + 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, + 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 + }; + static const bits16 sqrtEvenAdjustments[] = { + 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, + 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 + }; + int8 index; + bits32 z; + + index = ( a>>27 ) & 15; + if ( aExp & 1 ) { + z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ]; + z = ( ( a / z )<<14 ) + ( z<<15 ); + a >>= 1; + } + else { + z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ]; + z = a / z + z; + z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); + if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); + } + return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 32 is returned. 
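+| For example, countLeadingZeros32( 0x00FFFFFF ) returns 8, and
+| countLeadingZeros32( 1 ) returns 31.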
+*----------------------------------------------------------------------------*/ + +static int8 countLeadingZeros32( bits32 a ) +{ + static const int8 countLeadingZerosHigh[] = { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int8 shiftCount; + + shiftCount = 0; + if ( a < 0x10000 ) { + shiftCount += 16; + a <<= 16; + } + if ( a < 0x1000000 ) { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZerosHigh[ a>>24 ]; + return shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 64 is returned. +*----------------------------------------------------------------------------*/ + +static int8 countLeadingZeros64( bits64 a ) +{ + int8 shiftCount; + + shiftCount = 0; + if ( a < ( (bits64) 1 )<<32 ) { + shiftCount += 32; + } + else { + a >>= 32; + } + shiftCount += countLeadingZeros32( a ); + return shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' +| is equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 == b0 ) && ( a1 == b1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than or equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, +| returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is +| not equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. 
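+| This is simply the logical negation of `eq128'.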
+*----------------------------------------------------------------------------*/ + +INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 != b0 ) || ( a1 != b1 ); + +} diff --git a/src/recompiler/fpu/softfloat-native.c b/src/recompiler/fpu/softfloat-native.c new file mode 100644 index 00000000..7c7820ab --- /dev/null +++ b/src/recompiler/fpu/softfloat-native.c @@ -0,0 +1,521 @@ +/* Native implementation of soft float functions. Only a single status + context is supported */ +#include "softfloat.h" +#include <math.h> +#if defined(CONFIG_SOLARIS) +#include <fenv.h> +#endif + +void set_float_rounding_mode(int val STATUS_PARAM) +{ + STATUS(float_rounding_mode) = val; +#if (defined(CONFIG_BSD) && !defined(__APPLE__) && !defined(__GLIBC__)) || \ + (defined(CONFIG_SOLARIS) && (CONFIG_SOLARIS_VERSION < 10 || CONFIG_SOLARIS_VERSION == 11)) /* VBOX adds sol 11 */ + fpsetround(val); +#else + fesetround(val); +#endif +} + +#ifdef FLOATX80 +void set_floatx80_rounding_precision(int val STATUS_PARAM) +{ + STATUS(floatx80_rounding_precision) = val; +} +#endif + +#if defined(CONFIG_BSD) || \ + (defined(CONFIG_SOLARIS) && CONFIG_SOLARIS_VERSION < 10) +#define lrint(d) ((int32_t)rint(d)) +#define llrint(d) ((int64_t)rint(d)) +#define lrintf(f) ((int32_t)rint(f)) +#define llrintf(f) ((int64_t)rint(f)) +#define sqrtf(f) ((float)sqrt(f)) +#define remainderf(fa, fb) ((float)remainder(fa, fb)) +#define rintf(f) ((float)rint(f)) +# if defined(VBOX) && defined(HOST_BSD) /* Some defines which only apply to *BSD */ +# define lrintl(f) ((int32_t)rint(f)) +# define llrintl(f) ((int64_t)rint(f)) +# define rintl(d) ((int32_t)rint(d)) +# define sqrtl(f) (sqrt(f)) +# define remainderl(fa, fb) (remainder(fa, fb)) +# endif /* VBOX && _BSD */ +#if !defined(__sparc__) && \ + (defined(CONFIG_SOLARIS) && CONFIG_SOLARIS_VERSION < 10) +extern long double rintl(long double); +extern long double scalbnl(long double, int); + +long long +llrintl(long double x) { + return ((long long) rintl(x)); +} + +long +lrintl(long double x) { + return ((long) rintl(x)); +} + +long double +ldexpl(long double x, int n) { + return (scalbnl(x, n)); +} +#endif +#endif + +#if defined(_ARCH_PPC) + +/* correct (but slow) PowerPC rint() (glibc version is incorrect) */ +static double qemu_rint(double x) +{ + double y = 4503599627370496.0; + if (fabs(x) >= y) + return x; + if (x < 0) + y = -y; + y = (x + y) - y; + if (y == 0.0) + y = copysign(y, x); + return y; +} + +#define rint qemu_rint +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. 
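+| In this native implementation each conversion is a plain C cast, so the
+| host's conversion and rounding behaviour applies.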
+*----------------------------------------------------------------------------*/ +float32 int32_to_float32(int v STATUS_PARAM) +{ + return (float32)v; +} + +float32 uint32_to_float32(unsigned int v STATUS_PARAM) +{ + return (float32)v; +} + +float64 int32_to_float64(int v STATUS_PARAM) +{ + return (float64)v; +} + +float64 uint32_to_float64(unsigned int v STATUS_PARAM) +{ + return (float64)v; +} + +#ifdef FLOATX80 +floatx80 int32_to_floatx80(int v STATUS_PARAM) +{ + return (floatx80)v; +} +#endif +float32 int64_to_float32( int64_t v STATUS_PARAM) +{ + return (float32)v; +} +float32 uint64_to_float32( uint64_t v STATUS_PARAM) +{ + return (float32)v; +} +float64 int64_to_float64( int64_t v STATUS_PARAM) +{ + return (float64)v; +} +float64 uint64_to_float64( uint64_t v STATUS_PARAM) +{ + return (float64)v; +} +#ifdef FLOATX80 +floatx80 int64_to_floatx80( int64_t v STATUS_PARAM) +{ + return (floatx80)v; +} +#endif + +/* XXX: this code implements the x86 behaviour, not the IEEE one. */ +#if HOST_LONG_BITS == 32 +static inline int long_to_int32(long a) +{ + return a; +} +#else +static inline int long_to_int32(long a) +{ + if (a != (int32_t)a) + a = 0x80000000; + return a; +} +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int float32_to_int32( float32 a STATUS_PARAM) +{ + return long_to_int32(lrintf(a)); +} +int float32_to_int32_round_to_zero( float32 a STATUS_PARAM) +{ + return (int)a; +} +int64_t float32_to_int64( float32 a STATUS_PARAM) +{ + return llrintf(a); +} + +int64_t float32_to_int64_round_to_zero( float32 a STATUS_PARAM) +{ + return (int64_t)a; +} + +float64 float32_to_float64( float32 a STATUS_PARAM) +{ + return a; +} +#ifdef FLOATX80 +floatx80 float32_to_floatx80( float32 a STATUS_PARAM) +{ + return a; +} +#endif + +unsigned int float32_to_uint32( float32 a STATUS_PARAM) +{ + int64_t v; + unsigned int res; + + v = llrintf(a); + if (v < 0) { + res = 0; + } else if (v > 0xffffffff) { + res = 0xffffffff; + } else { + res = v; + } + return res; +} +unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM) +{ + int64_t v; + unsigned int res; + + v = (int64_t)a; + if (v < 0) { + res = 0; + } else if (v > 0xffffffff) { + res = 0xffffffff; + } else { + res = v; + } + return res; +} + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. 
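+| The operations below are thin wrappers over the host libm (rintf,
+| remainderf, sqrtf); the quiet compares use the C99 isless()/isgreater()
+| macros.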
+*----------------------------------------------------------------------------*/ +float32 float32_round_to_int( float32 a STATUS_PARAM) +{ + return rintf(a); +} + +float32 float32_rem( float32 a, float32 b STATUS_PARAM) +{ + return remainderf(a, b); +} + +float32 float32_sqrt( float32 a STATUS_PARAM) +{ + return sqrtf(a); +} +int float32_compare( float32 a, float32 b STATUS_PARAM ) +{ + if (a < b) { + return float_relation_less; + } else if (a == b) { + return float_relation_equal; + } else if (a > b) { + return float_relation_greater; + } else { + return float_relation_unordered; + } +} +int float32_compare_quiet( float32 a, float32 b STATUS_PARAM ) +{ + if (isless(a, b)) { + return float_relation_less; + } else if (a == b) { + return float_relation_equal; + } else if (isgreater(a, b)) { + return float_relation_greater; + } else { + return float_relation_unordered; + } +} +int float32_is_signaling_nan( float32 a1) +{ + float32u u; + uint32_t a; + u.f = a1; + a = u.i; + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF ); +} + +int float32_is_nan( float32 a1 ) +{ + float32u u; + uint64_t a; + u.f = a1; + a = u.i; + return ( 0xFF800000 < ( a<<1 ) ); +} + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int float64_to_int32( float64 a STATUS_PARAM) +{ + return long_to_int32(lrint(a)); +} +int float64_to_int32_round_to_zero( float64 a STATUS_PARAM) +{ + return (int)a; +} +int64_t float64_to_int64( float64 a STATUS_PARAM) +{ + return llrint(a); +} +int64_t float64_to_int64_round_to_zero( float64 a STATUS_PARAM) +{ + return (int64_t)a; +} +float32 float64_to_float32( float64 a STATUS_PARAM) +{ + return a; +} +#ifdef FLOATX80 +floatx80 float64_to_floatx80( float64 a STATUS_PARAM) +{ + return a; +} +#endif +#ifdef FLOAT128 +float128 float64_to_float128( float64 a STATUS_PARAM) +{ + return a; +} +#endif + +unsigned int float64_to_uint32( float64 a STATUS_PARAM) +{ + int64_t v; + unsigned int res; + + v = llrint(a); + if (v < 0) { + res = 0; + } else if (v > 0xffffffff) { + res = 0xffffffff; + } else { + res = v; + } + return res; +} +unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM) +{ + int64_t v; + unsigned int res; + + v = (int64_t)a; + if (v < 0) { + res = 0; + } else if (v > 0xffffffff) { + res = 0xffffffff; + } else { + res = v; + } + return res; +} +uint64_t float64_to_uint64 (float64 a STATUS_PARAM) +{ + int64_t v; + + v = llrint(a + (float64)INT64_MIN); + + return v - INT64_MIN; +} +uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) +{ + int64_t v; + + v = (int64_t)(a + (float64)INT64_MIN); + + return v - INT64_MIN; +} + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. +*----------------------------------------------------------------------------*/ +#if defined(__sun__) && \ + (defined(CONFIG_SOLARIS) && CONFIG_SOLARIS_VERSION < 10) +static inline float64 trunc(float64 x) +{ + return x < 0 ? 
-floor(-x) : floor(x); +} +#endif +float64 float64_trunc_to_int( float64 a STATUS_PARAM ) +{ + return trunc(a); +} + +float64 float64_round_to_int( float64 a STATUS_PARAM ) +{ + return rint(a); +} + +float64 float64_rem( float64 a, float64 b STATUS_PARAM) +{ + return remainder(a, b); +} + +float64 float64_sqrt( float64 a STATUS_PARAM) +{ + return sqrt(a); +} +int float64_compare( float64 a, float64 b STATUS_PARAM ) +{ + if (a < b) { + return float_relation_less; + } else if (a == b) { + return float_relation_equal; + } else if (a > b) { + return float_relation_greater; + } else { + return float_relation_unordered; + } +} +int float64_compare_quiet( float64 a, float64 b STATUS_PARAM ) +{ + if (isless(a, b)) { + return float_relation_less; + } else if (a == b) { + return float_relation_equal; + } else if (isgreater(a, b)) { + return float_relation_greater; + } else { + return float_relation_unordered; + } +} +int float64_is_signaling_nan( float64 a1) +{ + float64u u; + uint64_t a; + u.f = a1; + a = u.i; + return + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE ) + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) ); + +} + +int float64_is_nan( float64 a1 ) +{ + float64u u; + uint64_t a; + u.f = a1; + a = u.i; + + return ( LIT64( 0xFFF0000000000000 ) < (bits64) ( a<<1 ) ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int floatx80_to_int32( floatx80 a STATUS_PARAM) +{ + return long_to_int32(lrintl(a)); +} +int floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM) +{ + return (int)a; +} +int64_t floatx80_to_int64( floatx80 a STATUS_PARAM) +{ + return llrintl(a); +} +int64_t floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM) +{ + return (int64_t)a; +} +float32 floatx80_to_float32( floatx80 a STATUS_PARAM) +{ + return a; +} +float64 floatx80_to_float64( floatx80 a STATUS_PARAM) +{ + return a; +} + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. 
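+| As above, these wrap the host's long double routines: rintl(), remainderl()
+| and sqrtl().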
+*----------------------------------------------------------------------------*/ +floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM) +{ + return rintl(a); +} +floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM) +{ + return remainderl(a, b); +} +floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM) +{ + return sqrtl(a); +} +int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM ) +{ + if (a < b) { + return float_relation_less; + } else if (a == b) { + return float_relation_equal; + } else if (a > b) { + return float_relation_greater; + } else { + return float_relation_unordered; + } +} +int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + if (isless(a, b)) { + return float_relation_less; + } else if (a == b) { + return float_relation_equal; + } else if (isgreater(a, b)) { + return float_relation_greater; + } else { + return float_relation_unordered; + } +} +int floatx80_is_signaling_nan( floatx80 a1) +{ + floatx80u u; + uint64_t aLow; + u.f = a1; + + aLow = u.i.low & ~ LIT64( 0x4000000000000000 ); + return + ( ( u.i.high & 0x7FFF ) == 0x7FFF ) + && (bits64) ( aLow<<1 ) + && ( u.i.low == aLow ); +} + +int floatx80_is_nan( floatx80 a1 ) +{ + floatx80u u; + u.f = a1; + return ( ( u.i.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( u.i.low<<1 ); +} + +#endif diff --git a/src/recompiler/fpu/softfloat-native.h b/src/recompiler/fpu/softfloat-native.h new file mode 100644 index 00000000..13be732a --- /dev/null +++ b/src/recompiler/fpu/softfloat-native.h @@ -0,0 +1,493 @@ +/* Native implementation of soft float functions */ +#define __C99FEATURES__ +#include <math.h> + +#if (defined(CONFIG_BSD) && !defined(__APPLE__) && !defined(__GLIBC__) && !defined(__FreeBSD__)) \ + || defined(CONFIG_SOLARIS) /* VBox: Added __FreeBSD__ */ +#include <ieeefp.h> +#define fabsf(f) ((float)fabs(f)) +#else +#include <fenv.h> +#endif + +#if defined(__OpenBSD__) || defined(__NetBSD__) +#include <sys/param.h> +#endif + +/* + * Define some C99-7.12.3 classification macros and + * some C99-.12.4 for Solaris systems OS less than 10, + * or Solaris 10 systems running GCC 3.x or less. + * Solaris 10 with GCC4 does not need these macros as they + * are defined in <iso/math_c99.h> with a compiler directive + */ +#if defined(CONFIG_SOLARIS) && \ + ((CONFIG_SOLARIS_VERSION <= 9 ) || \ + ((CONFIG_SOLARIS_VERSION == 10) && (__GNUC__ < 4))) \ + || (defined(__OpenBSD__) && (OpenBSD < 200811)) +/* + * C99 7.12.3 classification macros + * and + * C99 7.12.14 comparison macros + * + * ... do not work on Solaris 10 using GNU CC 3.4.x. + * Try to workaround the missing / broken C99 math macros. 
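+ * The fallbacks below define the comparison macros in terms of unordered()
+ * and the plain relational operators.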
+ */ +#if defined(__OpenBSD__) +#define unordered(x, y) (isnan(x) || isnan(y)) +#endif + +#ifdef __NetBSD__ +#ifndef isgreater +#define isgreater(x, y) __builtin_isgreater(x, y) +#endif +#ifndef isgreaterequal +#define isgreaterequal(x, y) __builtin_isgreaterequal(x, y) +#endif +#ifndef isless +#define isless(x, y) __builtin_isless(x, y) +#endif +#ifndef islessequal +#define islessequal(x, y) __builtin_islessequal(x, y) +#endif +#ifndef isunordered +#define isunordered(x, y) __builtin_isunordered(x, y) +#endif +#endif + + +#define isnormal(x) (fpclass(x) >= FP_NZERO) +#define isgreater(x, y) ((!unordered(x, y)) && ((x) > (y))) +#define isgreaterequal(x, y) ((!unordered(x, y)) && ((x) >= (y))) +#define isless(x, y) ((!unordered(x, y)) && ((x) < (y))) +#define islessequal(x, y) ((!unordered(x, y)) && ((x) <= (y))) +#define isunordered(x,y) unordered(x, y) +#endif + +#if defined(__sun__) && !defined(CONFIG_NEEDS_LIBSUNMATH) + +#ifndef isnan +# define isnan(x) \ + (sizeof (x) == sizeof (long double) ? isnan_ld (x) \ + : sizeof (x) == sizeof (double) ? isnan_d (x) \ + : isnan_f (x)) +static inline int isnan_f (float x) { return x != x; } +static inline int isnan_d (double x) { return x != x; } +static inline int isnan_ld (long double x) { return x != x; } +#endif + +#ifndef isinf +# define isinf(x) \ + (sizeof (x) == sizeof (long double) ? isinf_ld (x) \ + : sizeof (x) == sizeof (double) ? isinf_d (x) \ + : isinf_f (x)) +static inline int isinf_f (float x) { return isnan (x - x); } +static inline int isinf_d (double x) { return isnan (x - x); } +static inline int isinf_ld (long double x) { return isnan (x - x); } +#endif +#endif + +typedef float float32; +typedef double float64; +#ifdef FLOATX80 +typedef long double floatx80; +#endif + +typedef union { + float32 f; + uint32_t i; +} float32u; +typedef union { + float64 f; + uint64_t i; +} float64u; +#ifdef FLOATX80 +typedef union { + floatx80 f; + struct { + uint64_t low; + uint16_t high; + } i; +} floatx80u; +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +#if (defined(CONFIG_BSD) && !defined(__APPLE__) && !defined(__GLIBC__)) \ + || defined(CONFIG_SOLARIS) +#if defined(__OpenBSD__) +#define FE_RM FP_RM +#define FE_RP FP_RP +#define FE_RZ FP_RZ +#endif +enum { + float_round_nearest_even = FP_RN, + float_round_down = FP_RM, + float_round_up = FP_RP, + float_round_to_zero = FP_RZ +}; +#else +enum { + float_round_nearest_even = FE_TONEAREST, + float_round_down = FE_DOWNWARD, + float_round_up = FE_UPWARD, + float_round_to_zero = FE_TOWARDZERO +}; +#endif + +typedef struct float_status { + int float_rounding_mode; +#ifdef FLOATX80 + int floatx80_rounding_precision; +#endif +} float_status; + +void set_float_rounding_mode(int val STATUS_PARAM); +#ifdef FLOATX80 +void set_floatx80_rounding_precision(int val STATUS_PARAM); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. 
+*----------------------------------------------------------------------------*/ +float32 int32_to_float32( int STATUS_PARAM); +float32 uint32_to_float32( unsigned int STATUS_PARAM); +float64 int32_to_float64( int STATUS_PARAM); +float64 uint32_to_float64( unsigned int STATUS_PARAM); +#ifdef FLOATX80 +floatx80 int32_to_floatx80( int STATUS_PARAM); +#endif +#ifdef FLOAT128 +float128 int32_to_float128( int STATUS_PARAM); +#endif +float32 int64_to_float32( int64_t STATUS_PARAM); +float32 uint64_to_float32( uint64_t STATUS_PARAM); +float64 int64_to_float64( int64_t STATUS_PARAM); +float64 uint64_to_float64( uint64_t v STATUS_PARAM); +#ifdef FLOATX80 +floatx80 int64_to_floatx80( int64_t STATUS_PARAM); +#endif +#ifdef FLOAT128 +float128 int64_to_float128( int64_t STATUS_PARAM); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int float32_to_int32( float32 STATUS_PARAM); +int float32_to_int32_round_to_zero( float32 STATUS_PARAM); +unsigned int float32_to_uint32( float32 a STATUS_PARAM); +unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM); +int64_t float32_to_int64( float32 STATUS_PARAM); +int64_t float32_to_int64_round_to_zero( float32 STATUS_PARAM); +float64 float32_to_float64( float32 STATUS_PARAM); +#ifdef FLOATX80 +floatx80 float32_to_floatx80( float32 STATUS_PARAM); +#endif +#ifdef FLOAT128 +float128 float32_to_float128( float32 STATUS_PARAM); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. +*----------------------------------------------------------------------------*/ +float32 float32_round_to_int( float32 STATUS_PARAM); +INLINE float32 float32_add( float32 a, float32 b STATUS_PARAM) +{ + return a + b; +} +INLINE float32 float32_sub( float32 a, float32 b STATUS_PARAM) +{ + return a - b; +} +INLINE float32 float32_mul( float32 a, float32 b STATUS_PARAM) +{ + return a * b; +} +INLINE float32 float32_div( float32 a, float32 b STATUS_PARAM) +{ + return a / b; +} +float32 float32_rem( float32, float32 STATUS_PARAM); +float32 float32_sqrt( float32 STATUS_PARAM); +INLINE int float32_eq( float32 a, float32 b STATUS_PARAM) +{ + return a == b; +} +INLINE int float32_le( float32 a, float32 b STATUS_PARAM) +{ + return a <= b; +} +INLINE int float32_lt( float32 a, float32 b STATUS_PARAM) +{ + return a < b; +} +INLINE int float32_eq_signaling( float32 a, float32 b STATUS_PARAM) +{ + return a <= b && a >= b; +} +INLINE int float32_le_quiet( float32 a, float32 b STATUS_PARAM) +{ + return islessequal(a, b); +} +INLINE int float32_lt_quiet( float32 a, float32 b STATUS_PARAM) +{ + return isless(a, b); +} +INLINE int float32_unordered( float32 a, float32 b STATUS_PARAM) +{ + return isunordered(a, b); + +} +int float32_compare( float32, float32 STATUS_PARAM ); +int float32_compare_quiet( float32, float32 STATUS_PARAM ); +int float32_is_signaling_nan( float32 ); +int float32_is_nan( float32 ); + +INLINE float32 float32_abs(float32 a) +{ + return fabsf(a); +} + +INLINE float32 float32_chs(float32 a) +{ + return -a; +} + +INLINE float32 float32_is_infinity(float32 a) +{ + return fpclassify(a) == FP_INFINITE; +} + +INLINE float32 float32_is_neg(float32 a) +{ + float32u u; + u.f = a; + return u.i >> 31; +} + +INLINE float32 float32_is_zero(float32 a) +{ + return fpclassify(a) == FP_ZERO; +} + +INLINE float32 
float32_scalbn(float32 a, int n) +{ + return scalbnf(a, n); +} + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int float64_to_int32( float64 STATUS_PARAM ); +int float64_to_int32_round_to_zero( float64 STATUS_PARAM ); +unsigned int float64_to_uint32( float64 STATUS_PARAM ); +unsigned int float64_to_uint32_round_to_zero( float64 STATUS_PARAM ); +int64_t float64_to_int64( float64 STATUS_PARAM ); +int64_t float64_to_int64_round_to_zero( float64 STATUS_PARAM ); +uint64_t float64_to_uint64( float64 STATUS_PARAM ); +uint64_t float64_to_uint64_round_to_zero( float64 STATUS_PARAM ); +float32 float64_to_float32( float64 STATUS_PARAM ); +#ifdef FLOATX80 +floatx80 float64_to_floatx80( float64 STATUS_PARAM ); +#endif +#ifdef FLOAT128 +float128 float64_to_float128( float64 STATUS_PARAM ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. +*----------------------------------------------------------------------------*/ +float64 float64_round_to_int( float64 STATUS_PARAM ); +float64 float64_trunc_to_int( float64 STATUS_PARAM ); +INLINE float64 float64_add( float64 a, float64 b STATUS_PARAM) +{ + return a + b; +} +INLINE float64 float64_sub( float64 a, float64 b STATUS_PARAM) +{ + return a - b; +} +INLINE float64 float64_mul( float64 a, float64 b STATUS_PARAM) +{ + return a * b; +} +INLINE float64 float64_div( float64 a, float64 b STATUS_PARAM) +{ + return a / b; +} +float64 float64_rem( float64, float64 STATUS_PARAM ); +float64 float64_sqrt( float64 STATUS_PARAM ); +INLINE int float64_eq( float64 a, float64 b STATUS_PARAM) +{ + return a == b; +} +INLINE int float64_le( float64 a, float64 b STATUS_PARAM) +{ + return a <= b; +} +INLINE int float64_lt( float64 a, float64 b STATUS_PARAM) +{ + return a < b; +} +INLINE int float64_eq_signaling( float64 a, float64 b STATUS_PARAM) +{ + return a <= b && a >= b; +} +INLINE int float64_le_quiet( float64 a, float64 b STATUS_PARAM) +{ + return islessequal(a, b); +} +INLINE int float64_lt_quiet( float64 a, float64 b STATUS_PARAM) +{ + return isless(a, b); + +} +INLINE int float64_unordered( float64 a, float64 b STATUS_PARAM) +{ + return isunordered(a, b); + +} +int float64_compare( float64, float64 STATUS_PARAM ); +int float64_compare_quiet( float64, float64 STATUS_PARAM ); +int float64_is_signaling_nan( float64 ); +int float64_is_nan( float64 ); + +INLINE float64 float64_abs(float64 a) +{ + return fabs(a); +} + +INLINE float64 float64_chs(float64 a) +{ + return -a; +} + +INLINE float64 float64_is_infinity(float64 a) +{ + return fpclassify(a) == FP_INFINITE; +} + +INLINE float64 float64_is_neg(float64 a) +{ + float64u u; + u.f = a; + return u.i >> 63; +} + +INLINE float64 float64_is_zero(float64 a) +{ + return fpclassify(a) == FP_ZERO; +} + +INLINE float64 float64_scalbn(float64 a, int n) +{ + return scalbn(a, n); +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision conversion routines. 
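+| In this header floatx80 is simply the host's long double type.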
+*----------------------------------------------------------------------------*/ +int floatx80_to_int32( floatx80 STATUS_PARAM ); +int floatx80_to_int32_round_to_zero( floatx80 STATUS_PARAM ); +int64_t floatx80_to_int64( floatx80 STATUS_PARAM); +int64_t floatx80_to_int64_round_to_zero( floatx80 STATUS_PARAM); +float32 floatx80_to_float32( floatx80 STATUS_PARAM ); +float64 floatx80_to_float64( floatx80 STATUS_PARAM ); +#ifdef FLOAT128 +float128 floatx80_to_float128( floatx80 STATUS_PARAM ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. +*----------------------------------------------------------------------------*/ +floatx80 floatx80_round_to_int( floatx80 STATUS_PARAM ); +INLINE floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a + b; +} +INLINE floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a - b; +} +INLINE floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a * b; +} +INLINE floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a / b; +} +floatx80 floatx80_rem( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_sqrt( floatx80 STATUS_PARAM ); +INLINE int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a == b; +} +INLINE int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a <= b; +} +INLINE int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a < b; +} +INLINE int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM) +{ + return a <= b && a >= b; +} +INLINE int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM) +{ + return islessequal(a, b); +} +INLINE int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM) +{ + return isless(a, b); + +} +INLINE int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM) +{ + return isunordered(a, b); + +} +int floatx80_compare( floatx80, floatx80 STATUS_PARAM ); +int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_is_signaling_nan( floatx80 ); +int floatx80_is_nan( floatx80 ); + +INLINE floatx80 floatx80_abs(floatx80 a) +{ + return fabsl(a); +} + +INLINE floatx80 floatx80_chs(floatx80 a) +{ + return -a; +} + +INLINE floatx80 floatx80_is_infinity(floatx80 a) +{ + return fpclassify(a) == FP_INFINITE; +} + +INLINE floatx80 floatx80_is_neg(floatx80 a) +{ + floatx80u u; + u.f = a; + return u.i.high >> 15; +} + +INLINE floatx80 floatx80_is_zero(floatx80 a) +{ + return fpclassify(a) == FP_ZERO; +} + +INLINE floatx80 floatx80_scalbn(floatx80 a, int n) +{ + return scalbnl(a, n); +} + +#endif diff --git a/src/recompiler/fpu/softfloat-specialize.h b/src/recompiler/fpu/softfloat-specialize.h new file mode 100644 index 00000000..8e6aceb5 --- /dev/null +++ b/src/recompiler/fpu/softfloat-specialize.h @@ -0,0 +1,581 @@ + +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. 
More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#if defined(TARGET_MIPS) || defined(TARGET_HPPA) +#define SNAN_BIT_IS_ONE 1 +#else +#define SNAN_BIT_IS_ONE 0 +#endif + +/*---------------------------------------------------------------------------- +| Raises the exceptions specified by `flags'. Floating-point traps can be +| defined here if desired. It is currently not possible for such a trap +| to substitute a result value. If traps are not implemented, this routine +| should be simply `float_exception_flags |= flags;'. +*----------------------------------------------------------------------------*/ + +void float_raise( int8 flags STATUS_PARAM ) +{ + STATUS(float_exception_flags) |= flags; +} + +/*---------------------------------------------------------------------------- +| Internal canonical NaN format. +*----------------------------------------------------------------------------*/ +typedef struct { + flag sign; + bits64 high, low; +} commonNaNT; + +/*---------------------------------------------------------------------------- +| The pattern for a default generated single-precision NaN. +*----------------------------------------------------------------------------*/ +#if defined(TARGET_SPARC) +#define float32_default_nan make_float32(0x7FFFFFFF) +#elif defined(TARGET_POWERPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) +#define float32_default_nan make_float32(0x7FC00000) +#elif defined(TARGET_HPPA) +#define float32_default_nan make_float32(0x7FA00000) +#elif SNAN_BIT_IS_ONE +#define float32_default_nan make_float32(0x7FBFFFFF) +#else +#define float32_default_nan make_float32(0xFFC00000) +#endif + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a quiet +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float32_is_nan( float32 a_ ) +{ + uint32_t a = float32_val(a_); +#if SNAN_BIT_IS_ONE + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF ); +#else + return ( 0xFF800000 <= (bits32) ( a<<1 ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. 
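+| With SNAN_BIT_IS_ONE zero this means: exponent field all ones, bit 22 (the
+| quiet bit) clear, and at least one of the remaining fraction bits set.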
+*----------------------------------------------------------------------------*/ + +int float32_is_signaling_nan( float32 a_ ) +{ + uint32_t a = float32_val(a_); +#if SNAN_BIT_IS_ONE + return ( 0xFF800000 <= (bits32) ( a<<1 ) ); +#else + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float32ToCommonNaN( float32 a STATUS_PARAM ) +{ + commonNaNT z; + + if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR ); + z.sign = float32_val(a)>>31; + z.low = 0; + z.high = ( (bits64) float32_val(a) )<<41; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the single- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float32 commonNaNToFloat32( commonNaNT a ) +{ + bits32 mantissa = a.high>>41; + if ( mantissa ) + return make_float32( + ( ( (bits32) a.sign )<<31 ) | 0x7F800000 | ( a.high>>41 ) ); + else + return float32_default_nan; +} + +/*---------------------------------------------------------------------------- +| Takes two single-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + bits32 av, bv, res; + + if ( STATUS(default_nan_mode) ) + return float32_default_nan; + + aIsNaN = float32_is_nan( a ); + aIsSignalingNaN = float32_is_signaling_nan( a ); + bIsNaN = float32_is_nan( b ); + bIsSignalingNaN = float32_is_signaling_nan( b ); + av = float32_val(a); + bv = float32_val(b); +#if SNAN_BIT_IS_ONE + av &= ~0x00400000; + bv &= ~0x00400000; +#else + av |= 0x00400000; + bv |= 0x00400000; +#endif + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + if ( aIsSignalingNaN ) { + if ( bIsSignalingNaN ) goto returnLargerSignificand; + res = bIsNaN ? bv : av; + } + else if ( aIsNaN ) { + if ( bIsSignalingNaN || ! bIsNaN ) + res = av; + else { + returnLargerSignificand: + if ( (bits32) ( av<<1 ) < (bits32) ( bv<<1 ) ) + res = bv; + else if ( (bits32) ( bv<<1 ) < (bits32) ( av<<1 ) ) + res = av; + else + res = ( av < bv ) ? av : bv; + } + } + else { + res = bv; + } + return make_float32(res); +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. 
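+| For targets without a special case below (and with SNAN_BIT_IS_ONE zero)
+| this is the negative quiet NaN LIT64( 0xFFF8000000000000 ).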
+*----------------------------------------------------------------------------*/ +#if defined(TARGET_SPARC) +#define float64_default_nan make_float64(LIT64( 0x7FFFFFFFFFFFFFFF )) +#elif defined(TARGET_POWERPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) +#define float64_default_nan make_float64(LIT64( 0x7FF8000000000000 )) +#elif defined(TARGET_HPPA) +#define float64_default_nan make_float64(LIT64( 0x7FF4000000000000 )) +#elif SNAN_BIT_IS_ONE +#define float64_default_nan make_float64(LIT64( 0x7FF7FFFFFFFFFFFF )) +#else +#define float64_default_nan make_float64(LIT64( 0xFFF8000000000000 )) +#endif + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a quiet +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float64_is_nan( float64 a_ ) +{ + bits64 a = float64_val(a_); +#if SNAN_BIT_IS_ONE + return + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE ) + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) ); +#else + return ( LIT64( 0xFFF0000000000000 ) <= (bits64) ( a<<1 ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float64_is_signaling_nan( float64 a_ ) +{ + bits64 a = float64_val(a_); +#if SNAN_BIT_IS_ONE + return ( LIT64( 0xFFF0000000000000 ) <= (bits64) ( a<<1 ) ); +#else + return + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE ) + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float64ToCommonNaN( float64 a STATUS_PARAM) +{ + commonNaNT z; + + if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR); + z.sign = float64_val(a)>>63; + z.low = 0; + z.high = float64_val(a)<<12; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the double- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float64 commonNaNToFloat64( commonNaNT a ) +{ + bits64 mantissa = a.high>>12; + + if ( mantissa ) + return make_float64( + ( ( (bits64) a.sign )<<63 ) + | LIT64( 0x7FF0000000000000 ) + | ( a.high>>12 )); + else + return float64_default_nan; +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. 
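+| The result is always quiet: a quiet NaN operand is preferred over a
+| signaling one, and when both operands are NaNs of the same kind the one
+| with the larger significand is returned.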
+*----------------------------------------------------------------------------*/ + +static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + bits64 av, bv, res; + + if ( STATUS(default_nan_mode) ) + return float64_default_nan; + + aIsNaN = float64_is_nan( a ); + aIsSignalingNaN = float64_is_signaling_nan( a ); + bIsNaN = float64_is_nan( b ); + bIsSignalingNaN = float64_is_signaling_nan( b ); + av = float64_val(a); + bv = float64_val(b); +#if SNAN_BIT_IS_ONE + av &= ~LIT64( 0x0008000000000000 ); + bv &= ~LIT64( 0x0008000000000000 ); +#else + av |= LIT64( 0x0008000000000000 ); + bv |= LIT64( 0x0008000000000000 ); +#endif + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + if ( aIsSignalingNaN ) { + if ( bIsSignalingNaN ) goto returnLargerSignificand; + res = bIsNaN ? bv : av; + } + else if ( aIsNaN ) { + if ( bIsSignalingNaN || ! bIsNaN ) + res = av; + else { + returnLargerSignificand: + if ( (bits64) ( av<<1 ) < (bits64) ( bv<<1 ) ) + res = bv; + else if ( (bits64) ( bv<<1 ) < (bits64) ( av<<1 ) ) + res = av; + else + res = ( av < bv ) ? av : bv; + } + } + else { + res = bv; + } + return make_float64(res); +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. The +| `high' and `low' values hold the most- and least-significant bits, +| respectively. +*----------------------------------------------------------------------------*/ +#if SNAN_BIT_IS_ONE +#define floatx80_default_nan_high 0x7FFF +#define floatx80_default_nan_low LIT64( 0xBFFFFFFFFFFFFFFF ) +#else +#define floatx80_default_nan_high 0xFFFF +#define floatx80_default_nan_low LIT64( 0xC000000000000000 ) +#endif + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| quiet NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int floatx80_is_nan( floatx80 a ) +{ +#if SNAN_BIT_IS_ONE + bits64 aLow; + + aLow = a.low & ~ LIT64( 0x4000000000000000 ); + return + ( ( a.high & 0x7FFF ) == 0x7FFF ) + && (bits64) ( aLow<<1 ) + && ( a.low == aLow ); +#else + return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( a.low<<1 ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int floatx80_is_signaling_nan( floatx80 a ) +{ +#if SNAN_BIT_IS_ONE + return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( a.low<<1 ); +#else + bits64 aLow; + + aLow = a.low & ~ LIT64( 0x4000000000000000 ); + return + ( ( a.high & 0x7FFF ) == 0x7FFF ) + && (bits64) ( aLow<<1 ) + && ( a.low == aLow ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the +| invalid exception is raised. 
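+| The 64-bit explicit significand of `a' becomes the high part of the
+| canonical NaN; the low part is zero.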
+*----------------------------------------------------------------------------*/ + +static commonNaNT floatx80ToCommonNaN( floatx80 a STATUS_PARAM) +{ + commonNaNT z; + + if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR); + z.sign = a.high>>15; + z.low = 0; + z.high = a.low; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the extended +| double-precision floating-point format. +*----------------------------------------------------------------------------*/ + +static floatx80 commonNaNToFloatx80( commonNaNT a ) +{ + floatx80 z; + + if (a.high) + z.low = a.high; + else + z.low = floatx80_default_nan_low; + z.high = ( ( (bits16) a.sign )<<15 ) | 0x7FFF; + return z; +} + +/*---------------------------------------------------------------------------- +| Takes two extended double-precision floating-point values `a' and `b', one +| of which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b STATUS_PARAM) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + if ( STATUS(default_nan_mode) ) { + a.low = floatx80_default_nan_low; + a.high = floatx80_default_nan_high; + return a; + } + + aIsNaN = floatx80_is_nan( a ); + aIsSignalingNaN = floatx80_is_signaling_nan( a ); + bIsNaN = floatx80_is_nan( b ); + bIsSignalingNaN = floatx80_is_signaling_nan( b ); +#if SNAN_BIT_IS_ONE + a.low &= ~LIT64( 0xC000000000000000 ); + b.low &= ~LIT64( 0xC000000000000000 ); +#else + a.low |= LIT64( 0xC000000000000000 ); + b.low |= LIT64( 0xC000000000000000 ); +#endif + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + if ( aIsSignalingNaN ) { + if ( bIsSignalingNaN ) goto returnLargerSignificand; + return bIsNaN ? b : a; + } + else if ( aIsNaN ) { + if ( bIsSignalingNaN || ! bIsNaN ) return a; + returnLargerSignificand: + if ( a.low < b.low ) return b; + if ( b.low < a.low ) return a; + return ( a.high < b.high ) ? a : b; + } + else { + return b; + } +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. The `high' and +| `low' values hold the most- and least-significant bits, respectively. +*----------------------------------------------------------------------------*/ +#if SNAN_BIT_IS_ONE +#define float128_default_nan_high LIT64( 0x7FFF7FFFFFFFFFFF ) +#define float128_default_nan_low LIT64( 0xFFFFFFFFFFFFFFFF ) +#else +#define float128_default_nan_high LIT64( 0xFFFF800000000000 ) +#define float128_default_nan_low LIT64( 0x0000000000000000 ) +#endif + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a quiet +| NaN; otherwise returns 0. 
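+| `high' holds the sign bit, the 15-bit exponent and the upper 48 fraction
+| bits; `low' holds the remaining 64 fraction bits.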
+*----------------------------------------------------------------------------*/ + +int float128_is_nan( float128 a ) +{ +#if SNAN_BIT_IS_ONE + return + ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE ) + && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) ); +#else + return + ( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) ) + && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +int float128_is_signaling_nan( float128 a ) +{ +#if SNAN_BIT_IS_ONE + return + ( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) ) + && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) ); +#else + return + ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE ) + && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) ); +#endif +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float128ToCommonNaN( float128 a STATUS_PARAM) +{ + commonNaNT z; + + if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR); + z.sign = a.high>>63; + shortShift128Left( a.high, a.low, 16, &z.high, &z.low ); + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the quadruple- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float128 commonNaNToFloat128( commonNaNT a ) +{ + float128 z; + + shift128Right( a.high, a.low, 16, &z.high, &z.low ); + z.high |= ( ( (bits64) a.sign )<<63 ) | LIT64( 0x7FFF000000000000 ); + return z; +} + +/*---------------------------------------------------------------------------- +| Takes two quadruple-precision floating-point values `a' and `b', one of +| which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float128 propagateFloat128NaN( float128 a, float128 b STATUS_PARAM) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + if ( STATUS(default_nan_mode) ) { + a.low = float128_default_nan_low; + a.high = float128_default_nan_high; + return a; + } + + aIsNaN = float128_is_nan( a ); + aIsSignalingNaN = float128_is_signaling_nan( a ); + bIsNaN = float128_is_nan( b ); + bIsSignalingNaN = float128_is_signaling_nan( b ); +#if SNAN_BIT_IS_ONE + a.high &= ~LIT64( 0x0000800000000000 ); + b.high &= ~LIT64( 0x0000800000000000 ); +#else + a.high |= LIT64( 0x0000800000000000 ); + b.high |= LIT64( 0x0000800000000000 ); +#endif + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR); + if ( aIsSignalingNaN ) { + if ( bIsSignalingNaN ) goto returnLargerSignificand; + return bIsNaN ? b : a; + } + else if ( aIsNaN ) { + if ( bIsSignalingNaN || ! 
bIsNaN ) return a; + returnLargerSignificand: + if ( lt128( a.high<<1, a.low, b.high<<1, b.low ) ) return b; + if ( lt128( b.high<<1, b.low, a.high<<1, a.low ) ) return a; + return ( a.high < b.high ) ? a : b; + } + else { + return b; + } +} + +#endif diff --git a/src/recompiler/fpu/softfloat.c b/src/recompiler/fpu/softfloat.c new file mode 100644 index 00000000..6806ebc2 --- /dev/null +++ b/src/recompiler/fpu/softfloat.c @@ -0,0 +1,5883 @@ + +/*============================================================================ + +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/* FIXME: Flush-To-Zero only effects results. Denormal inputs should also + be flushed to zero. */ +#include "softfloat.h" + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target if +| desired.) +*----------------------------------------------------------------------------*/ +#include "softfloat-macros.h" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. 
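+| They are supplied by `softfloat-specialize.h', included just below.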
+*----------------------------------------------------------------------------*/ +#include "softfloat-specialize.h" + +void set_float_rounding_mode(int val STATUS_PARAM) +{ + STATUS(float_rounding_mode) = val; +} + +void set_float_exception_flags(int val STATUS_PARAM) +{ + STATUS(float_exception_flags) = val; +} + +#ifdef FLOATX80 +void set_floatx80_rounding_precision(int val STATUS_PARAM) +{ + STATUS(floatx80_rounding_precision) = val; +} +#endif + +/*---------------------------------------------------------------------------- +| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 +| and 7, and returns the properly rounded 32-bit integer corresponding to the +| input. If `zSign' is 1, the input is negated before being converted to an +| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input +| is simply rounded to an integer, with the inexact exception raised if the +| input cannot be represented exactly as an integer. However, if the fixed- +| point input is too large, the invalid exception is raised and the largest +| positive or negative integer is returned. +*----------------------------------------------------------------------------*/ + +static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven; + int8 roundIncrement, roundBits; + int32 z; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + roundIncrement = 0x40; + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = 0x7F; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = absZ & 0x7F; + absZ = ( absZ + roundIncrement )>>7; + absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); + z = absZ; + if ( zSign ) z = - z; + if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { + float_raise( float_flag_invalid STATUS_VAR); + return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and +| `absZ1', with binary point between bits 63 and 64 (between the input words), +| and returns the properly rounded 64-bit integer corresponding to the input. +| If `zSign' is 1, the input is negated before being converted to an integer. +| Ordinarily, the fixed-point input is simply rounded to an integer, with +| the inexact exception raised if the input cannot be represented exactly as +| an integer. However, if the fixed-point input is too large, the invalid +| exception is raised and the largest positive or negative integer is +| returned. +*----------------------------------------------------------------------------*/ + +static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven, increment; + int64 z; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + increment = ( (sbits64) absZ1 < 0 ); + if ( ! 
roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && absZ1; + } + else { + increment = ( roundingMode == float_round_up ) && absZ1; + } + } + } + if ( increment ) { + ++absZ0; + if ( absZ0 == 0 ) goto overflow; + absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); + } + z = absZ0; + if ( zSign ) z = - z; + if ( z && ( ( z < 0 ) ^ zSign ) ) { + overflow: + float_raise( float_flag_invalid STATUS_VAR); + return + zSign ? (sbits64) LIT64( 0x8000000000000000 ) + : LIT64( 0x7FFFFFFFFFFFFFFF ); + } + if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits32 extractFloat32Frac( float32 a ) +{ + + return float32_val(a) & 0x007FFFFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int16 extractFloat32Exp( float32 a ) +{ + + return ( float32_val(a)>>23 ) & 0xFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat32Sign( float32 a ) +{ + + return float32_val(a)>>31; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal single-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( aSig ) - 8; + *zSigPtr = aSig<<shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| single-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) +{ + + return make_float32( + ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. 
Ordinarily, the abstract +| value is simply rounded and packed into the single-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal single- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 30 +| and 29, which is 7 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven; + int8 roundIncrement, roundBits; + flag isTiny; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + roundIncrement = 0x40; + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = 0x7F; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = zSig & 0x7F; + if ( 0xFD <= (bits16) zExp ) { + if ( ( 0xFD < zExp ) + || ( ( zExp == 0xFD ) + && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) + ) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); + } + if ( zExp < 0 ) { + if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < -1 ) + || ( zSig + roundIncrement < 0x80000000 ); + shift32RightJamming( zSig, - zExp, &zSig ); + zExp = 0; + roundBits = zSig & 0x7F; + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); + } + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig = ( zSig + roundIncrement )>>7; + zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); + if ( zSig == 0 ) zExp = 0; + return packFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat32' except that `zSig' does not have to be normalized. +| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. 
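+| As an illustration, the shift below places the most significant set bit of
+| `zSig' at bit 30, which is where `roundAndPackFloat32' expects the leading
+| 1 of a normalized significand, and `zExp' is decreased by the same amount;
+| a hypothetical zSig = 0x00123456 (11 leading zeros) would be shifted left
+| by 10 places while zExp is reduced by 10.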
+*----------------------------------------------------------------------------*/ + +static float32 + normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( zSig ) - 1; + return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); + +} + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits64 extractFloat64Frac( float64 a ) +{ + + return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int16 extractFloat64Exp( float64 a ) +{ + + return ( float64_val(a)>>52 ) & 0x7FF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat64Sign( float64 a ) +{ + + return float64_val(a)>>63; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal double-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( aSig ) - 11; + *zSigPtr = aSig<<shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| double-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) +{ + + return make_float64( + ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. Ordinarily, the abstract +| value is simply rounded and packed into the double-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. 
If the abstract value is too small, the input value is rounded +| to a subnormal number, and the underflow and inexact exceptions are raised +| if the abstract input cannot be represented exactly as a subnormal double- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 62 +| and 61, which is 10 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven; + int16 roundIncrement, roundBits; + flag isTiny; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + roundIncrement = 0x200; + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = 0x3FF; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = zSig & 0x3FF; + if ( 0x7FD <= (bits16) zExp ) { + if ( ( 0x7FD < zExp ) + || ( ( zExp == 0x7FD ) + && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) + ) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); + } + if ( zExp < 0 ) { + if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < -1 ) + || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); + shift64RightJamming( zSig, - zExp, &zSig ); + zExp = 0; + roundBits = zSig & 0x3FF; + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); + } + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig = ( zSig + roundIncrement )>>10; + zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); + if ( zSig == 0 ) zExp = 0; + return packFloat64( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat64' except that `zSig' does not have to be normalized. +| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +static float64 + normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( zSig ) - 1; + return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the extended double-precision floating-point +| value `a'. 
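+| Unlike the single- and double-precision formats, the extended format keeps
+| its leading integer bit explicit, so the complete 64-bit significand,
+| integer bit included, is held in `a.low' and no hidden 1 has to be OR-ed
+| in by the callers of this routine.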
+*----------------------------------------------------------------------------*/ + +INLINE bits64 extractFloatx80Frac( floatx80 a ) +{ + + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int32 extractFloatx80Exp( floatx80 a ) +{ + + return a.high & 0x7FFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the extended double-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloatx80Sign( floatx80 a ) +{ + + return a.high>>15; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal extended double-precision floating-point value +| represented by the denormalized significand `aSig'. The normalized exponent +| and significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( aSig ); + *zSigPtr = aSig<<shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an +| extended double-precision floating-point value, returning the result. +*----------------------------------------------------------------------------*/ + +INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) +{ + floatx80 z; + + z.low = zSig; + z.high = ( ( (bits16) zSign )<<15 ) + zExp; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| rounded and packed into the extended double-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal extended +| double-precision floating-point number. +| If `roundingPrecision' is 32 or 64, the result is rounded to the same +| number of bits as single or double precision, respectively. Otherwise, the +| result is rounded to the full precision of the extended double-precision +| format. +| The input significand must be normalized or smaller. If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. The +| handling of underflow and overflow follows the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
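+| Concretely, a `roundingPrecision' of 64 treats the low 11 bits of `zSig0'
+| as round bits (mask 0x7FF), a `roundingPrecision' of 32 treats the low 40
+| bits that way (mask 0xFFFFFFFFFF), and in both cases a nonzero `zSig1' is
+| first folded into the sticky bit of `zSig0'.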
+*----------------------------------------------------------------------------*/ + +static floatx80 + roundAndPackFloatx80( + int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 + STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven, increment, isTiny; + int64 roundIncrement, roundMask, roundBits; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + if ( roundingPrecision == 80 ) goto precision80; + if ( roundingPrecision == 64 ) { + roundIncrement = LIT64( 0x0000000000000400 ); + roundMask = LIT64( 0x00000000000007FF ); + } + else if ( roundingPrecision == 32 ) { + roundIncrement = LIT64( 0x0000008000000000 ); + roundMask = LIT64( 0x000000FFFFFFFFFF ); + } + else { + goto precision80; + } + zSig0 |= ( zSig1 != 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = roundMask; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = zSig0 & roundMask; + if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { + if ( ( 0x7FFE < zExp ) + || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) + ) { + goto overflow; + } + if ( zExp <= 0 ) { + if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 ); + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < 0 ) + || ( zSig0 <= zSig0 + roundIncrement ); + shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); + zExp = 0; + roundBits = zSig0 & roundMask; + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig0 += roundIncrement; + if ( (sbits64) zSig0 < 0 ) zExp = 1; + roundIncrement = roundMask + 1; + if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { + roundMask |= roundIncrement; + } + zSig0 &= ~ roundMask; + return packFloatx80( zSign, zExp, zSig0 ); + } + } + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; + zSig0 += roundIncrement; + if ( zSig0 < roundIncrement ) { + ++zExp; + zSig0 = LIT64( 0x8000000000000000 ); + } + roundIncrement = roundMask + 1; + if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { + roundMask |= roundIncrement; + } + zSig0 &= ~ roundMask; + if ( zSig0 == 0 ) zExp = 0; + return packFloatx80( zSign, zExp, zSig0 ); + precision80: + increment = ( (sbits64) zSig1 < 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig1; + } + else { + increment = ( roundingMode == float_round_up ) && zSig1; + } + } + } + if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { + if ( ( 0x7FFE < zExp ) + || ( ( zExp == 0x7FFE ) + && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) + && increment + ) + ) { + roundMask = 0; + overflow: + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + if ( ( roundingMode == float_round_to_zero ) + || ( zSign && ( roundingMode == float_round_up ) ) + || ( ! zSign && ( roundingMode == float_round_down ) ) + ) { + return packFloatx80( zSign, 0x7FFE, ~ roundMask ); + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( zExp <= 0 ) { + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < 0 ) + || ! 
increment + || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); + shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); + zExp = 0; + if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); + if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; + if ( roundNearestEven ) { + increment = ( (sbits64) zSig1 < 0 ); + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig1; + } + else { + increment = ( roundingMode == float_round_up ) && zSig1; + } + } + if ( increment ) { + ++zSig0; + zSig0 &= + ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); + if ( (sbits64) zSig0 < 0 ) zExp = 1; + } + return packFloatx80( zSign, zExp, zSig0 ); + } + } + if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; + if ( increment ) { + ++zSig0; + if ( zSig0 == 0 ) { + ++zExp; + zSig0 = LIT64( 0x8000000000000000 ); + } + else { + zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); + } + } + else { + if ( zSig0 == 0 ) zExp = 0; + } + return packFloatx80( zSign, zExp, zSig0 ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent +| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. This routine is just like +| `roundAndPackFloatx80' except that the input significand does not have to be +| normalized. +*----------------------------------------------------------------------------*/ + +static floatx80 + normalizeRoundAndPackFloatx80( + int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 + STATUS_PARAM) +{ + int8 shiftCount; + + if ( zSig0 == 0 ) { + zSig0 = zSig1; + zSig1 = 0; + zExp -= 64; + } + shiftCount = countLeadingZeros64( zSig0 ); + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); + zExp -= shiftCount; + return + roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); + +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the least-significant 64 fraction bits of the quadruple-precision +| floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits64 extractFloat128Frac1( float128 a ) +{ + + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the most-significant 48 fraction bits of the quadruple-precision +| floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits64 extractFloat128Frac0( float128 a ) +{ + + return a.high & LIT64( 0x0000FFFFFFFFFFFF ); + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the quadruple-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +INLINE int32 extractFloat128Exp( float128 a ) +{ + + return ( a.high>>48 ) & 0x7FFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the quadruple-precision floating-point value `a'. 
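+| For reference, the quadruple-precision encoding used here stores the sign
+| in bit 63 of `a.high', a 15-bit exponent in bits 62 through 48 of
+| `a.high', the 48 most significant fraction bits in the remaining bits of
+| `a.high', and the other 64 fraction bits in `a.low'.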
+*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat128Sign( float128 a ) +{ + + return a.high>>63; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal quadruple-precision floating-point value +| represented by the denormalized significand formed by the concatenation of +| `aSig0' and `aSig1'. The normalized exponent is stored at the location +| pointed to by `zExpPtr'. The most significant 49 bits of the normalized +| significand are stored at the location pointed to by `zSig0Ptr', and the +| least significant 64 bits of the normalized significand are stored at the +| location pointed to by `zSig1Ptr'. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat128Subnormal( + bits64 aSig0, + bits64 aSig1, + int32 *zExpPtr, + bits64 *zSig0Ptr, + bits64 *zSig1Ptr + ) +{ + int8 shiftCount; + + if ( aSig0 == 0 ) { + shiftCount = countLeadingZeros64( aSig1 ) - 15; + if ( shiftCount < 0 ) { + *zSig0Ptr = aSig1>>( - shiftCount ); + *zSig1Ptr = aSig1<<( shiftCount & 63 ); + } + else { + *zSig0Ptr = aSig1<<shiftCount; + *zSig1Ptr = 0; + } + *zExpPtr = - shiftCount - 63; + } + else { + shiftCount = countLeadingZeros64( aSig0 ) - 15; + shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); + *zExpPtr = 1 - shiftCount; + } + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', the exponent `zExp', and the significand formed +| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision +| floating-point value, returning the result. After being shifted into the +| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply +| added together to form the most significant 32 bits of the result. This +| means that any integer portion of `zSig0' will be added into the exponent. +| Since a properly normalized significand will have an integer portion equal +| to 1, the `zExp' input should be 1 less than the desired result exponent +| whenever `zSig0' and `zSig1' concatenated form a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float128 + packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) +{ + float128 z; + + z.low = zSig1; + z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0', `zSig1', +| and `zSig2', and returns the proper quadruple-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| simply rounded and packed into the quadruple-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal quadruple- +| precision floating-point number. +| The input significand must be normalized or smaller. 
If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. In the +| usual case that the input significand is normalized, `zExp' must be 1 less +| than the ``true'' floating-point exponent. The handling of underflow and +| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 + roundAndPackFloat128( + flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM) +{ + int8 roundingMode; + flag roundNearestEven, increment, isTiny; + + roundingMode = STATUS(float_rounding_mode); + roundNearestEven = ( roundingMode == float_round_nearest_even ); + increment = ( (sbits64) zSig2 < 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig2; + } + else { + increment = ( roundingMode == float_round_up ) && zSig2; + } + } + } + if ( 0x7FFD <= (bits32) zExp ) { + if ( ( 0x7FFD < zExp ) + || ( ( zExp == 0x7FFD ) + && eq128( + LIT64( 0x0001FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ), + zSig0, + zSig1 + ) + && increment + ) + ) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + if ( ( roundingMode == float_round_to_zero ) + || ( zSign && ( roundingMode == float_round_up ) ) + || ( ! zSign && ( roundingMode == float_round_down ) ) + ) { + return + packFloat128( + zSign, + 0x7FFE, + LIT64( 0x0000FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ) + ); + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( zExp < 0 ) { + if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); + isTiny = + ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) + || ( zExp < -1 ) + || ! increment + || lt128( + zSig0, + zSig1, + LIT64( 0x0001FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ) + ); + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); + zExp = 0; + if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); + if ( roundNearestEven ) { + increment = ( (sbits64) zSig2 < 0 ); + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig2; + } + else { + increment = ( roundingMode == float_round_up ) && zSig2; + } + } + } + } + if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; + if ( increment ) { + add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); + zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); + } + else { + if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; + } + return packFloat128( zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand formed by the concatenation of `zSig0' and `zSig1', and +| returns the proper quadruple-precision floating-point value corresponding +| to the abstract input. This routine is just like `roundAndPackFloat128' +| except that the input significand has fewer bits and does not have to be +| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- +| point exponent. 
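+| In outline, a zero `zSig0' is first replaced by `zSig1' (with `zExp'
+| lowered by 64), the pair is then shifted so that its leading 1 lands in
+| bit 48 of the high word, and any bits pushed out to the right are gathered
+| into a third, sticky word for `roundAndPackFloat128' to round on.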
+*----------------------------------------------------------------------------*/ + +static float128 + normalizeRoundAndPackFloat128( + flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM) +{ + int8 shiftCount; + bits64 zSig2; + + if ( zSig0 == 0 ) { + zSig0 = zSig1; + zSig1 = 0; + zExp -= 64; + } + shiftCount = countLeadingZeros64( zSig0 ) - 15; + if ( 0 <= shiftCount ) { + zSig2 = 0; + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); + } + else { + shift128ExtraRightJamming( + zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); + } + zExp -= shiftCount; + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the single-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 int32_to_float32( int32 a STATUS_PARAM ) +{ + flag zSign; + + if ( a == 0 ) return float32_zero; + if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); + zSign = ( a < 0 ); + return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the double-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 int32_to_float64( int32 a STATUS_PARAM ) +{ + flag zSign; + uint32 absA; + int8 shiftCount; + bits64 zSig; + + if ( a == 0 ) return float64_zero; + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros32( absA ) + 21; + zSig = absA; + return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) +{ + flag zSign; + uint32 absA; + int8 shiftCount; + bits64 zSig; + + if ( a == 0 ) return packFloatx80( 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros32( absA ) + 32; + zSig = absA; + return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); + +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' to +| the quadruple-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 int32_to_float128( int32 a STATUS_PARAM ) +{ + flag zSign; + uint32 absA; + int8 shiftCount; + bits64 zSig0; + + if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? 
- a : a; + shiftCount = countLeadingZeros32( absA ) + 17; + zSig0 = absA; + return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' +| to the single-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 int64_to_float32( int64 a STATUS_PARAM ) +{ + flag zSign; + uint64 absA; + int8 shiftCount; + + if ( a == 0 ) return float32_zero; + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros64( absA ) - 40; + if ( 0 <= shiftCount ) { + return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); + } + else { + shiftCount += 7; + if ( shiftCount < 0 ) { + shift64RightJamming( absA, - shiftCount, &absA ); + } + else { + absA <<= shiftCount; + } + return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); + } + +} + +float32 uint64_to_float32( uint64 a STATUS_PARAM ) +{ + int8 shiftCount; + + if ( a == 0 ) return float32_zero; + shiftCount = countLeadingZeros64( a ) - 40; + if ( 0 <= shiftCount ) { + return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); + } + else { + shiftCount += 7; + if ( shiftCount < 0 ) { + shift64RightJamming( a, - shiftCount, &a ); + } + else { + a <<= shiftCount; + } + return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' +| to the double-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 int64_to_float64( int64 a STATUS_PARAM ) +{ + flag zSign; + + if ( a == 0 ) return float64_zero; + if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { + return packFloat64( 1, 0x43E, 0 ); + } + zSign = ( a < 0 ); + return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); + +} + +float64 uint64_to_float64( uint64 a STATUS_PARAM ) +{ + if ( a == 0 ) return float64_zero; + return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' +| to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) +{ + flag zSign; + uint64 absA; + int8 shiftCount; + + if ( a == 0 ) return packFloatx80( 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros64( absA ); + return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); + +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' to +| the quadruple-precision floating-point format. 
The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 int64_to_float128( int64 a STATUS_PARAM ) +{ + flag zSign; + uint64 absA; + int8 shiftCount; + int32 zExp; + bits64 zSig0, zSig1; + + if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); + zSign = ( a < 0 ); + absA = zSign ? - a : a; + shiftCount = countLeadingZeros64( absA ) + 49; + zExp = 0x406E - shiftCount; + if ( 64 <= shiftCount ) { + zSig1 = 0; + zSig0 = absA; + shiftCount -= 64; + } + else { + zSig1 = absA; + zSig0 = 0; + } + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); + return packFloat128( zSign, zExp, zSig0, zSig1 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float32_to_int32( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig; + bits64 aSig64; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( ( aExp == 0xFF ) && aSig ) aSign = 0; + if ( aExp ) aSig |= 0x00800000; + shiftCount = 0xAF - aExp; + aSig64 = aSig; + aSig64 <<= 32; + if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); + return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig; + int32 z; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = aExp - 0x9E; + if ( 0 <= shiftCount ) { + if ( float32_val(a) != 0xCF000000 ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; + } + return (sbits32) 0x80000000; + } + else if ( aExp <= 0x7E ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig = ( aSig | 0x00800000 )<<8; + z = aSig>>( - shiftCount ); + if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 float32_to_int64( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig; + bits64 aSig64, aSigExtra; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = 0xBE - aExp; + if ( shiftCount < 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + if ( aExp ) aSig |= 0x00800000; + aSig64 = aSig; + aSig64 <<= 40; + shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); + return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. If +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the +| conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig; + bits64 aSig64; + int64 z; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = aExp - 0xBE; + if ( 0 <= shiftCount ) { + if ( float32_val(a) != 0xDF000000 ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + else if ( aExp <= 0x7E ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig64 = aSig | 0x00800000; + aSig64 <<= 40; + z = aSig64>>( - shiftCount ); + if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. 
The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float32_to_float64( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR )); + return packFloat64( aSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + aSig |= 0x00800000; + return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); + +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float32_to_float128( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) ); + return packFloat128( aSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Rounds the single-precision floating-point value `a' to an integer, and +| returns the result as a single-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
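+| For example, in round-to-nearest-even mode 2.5 rounds to 2.0 while 3.5
+| rounds to 4.0, and the inexact exception is raised whenever the result
+| differs from the operand.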
+*----------------------------------------------------------------------------*/ + +float32 float32_round_to_int( float32 a STATUS_PARAM) +{ + flag aSign; + int16 aExp; + bits32 lastBitMask, roundBitsMask; + int8 roundingMode; + bits32 z; + + aExp = extractFloat32Exp( a ); + if ( 0x96 <= aExp ) { + if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { + return propagateFloat32NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp <= 0x7E ) { + if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a; + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloat32Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { + return packFloat32( aSign, 0x7F, 0 ); + } + break; + case float_round_down: + return make_float32(aSign ? 0xBF800000 : 0); + case float_round_up: + return make_float32(aSign ? 0x80000000 : 0x3F800000); + } + return packFloat32( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x96 - aExp; + roundBitsMask = lastBitMask - 1; + z = float32_val(a); + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z += lastBitMask>>1; + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { + z += roundBitsMask; + } + } + z &= ~ roundBitsMask; + if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; + return make_float32(z); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the single-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
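+| Both significands are first shifted left by 6 bits, so that the implicit
+| leading 1 corresponds to bit 29 (0x20000000) and the bits below it serve
+| as guard and sticky bits; the operand with the smaller exponent is then
+| aligned with `shift32RightJamming', which folds every bit shifted out into
+| a sticky bit, before the sum is rounded and packed.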
+*----------------------------------------------------------------------------*/ + +static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) +{ + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 6; + bSig <<= 6; + if ( 0 < expDiff ) { + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x20000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x20000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + zExp = bExp; + } + else { + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( aExp == 0 ) { + if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); + return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); + } + zSig = 0x40000000 + aSig + bSig; + zExp = aExp; + goto roundAndPack; + } + aSig |= 0x20000000; + zSig = ( aSig + bSig )<<1; + --zExp; + if ( (sbits32) zSig < 0 ) { + zSig = aSig + bSig; + ++zExp; + } + roundAndPack: + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the single- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. 
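+| When the two inputs have equal magnitude the difference is an exact zero,
+| which is returned as +0 in every rounding mode except round-down, where -0
+| is returned, as the IEC/IEEE standard requires.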
+*----------------------------------------------------------------------------*/ + +static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) +{ + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 7; + bSig <<= 7; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return packFloat32( zSign ^ 1, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x40000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + bSig |= 0x40000000; + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x40000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + aSig |= 0x40000000; + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the single-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_add( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return addFloat32Sigs( a, b, aSign STATUS_VAR); + } + else { + return subFloat32Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sub( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return subFloat32Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloat32Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. 
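+| The 24-bit significands (with the hidden bit made explicit) are multiplied
+| into a 64-bit product that is narrowed with `shift64RightJamming', so no
+| information is lost before rounding; multiplying an infinity by zero
+| raises the invalid exception and yields the default NaN.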
+*----------------------------------------------------------------------------*/ + +float32 float32_mul( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig, bSig; + bits64 zSig64; + bits32 zSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x7F; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); + zSig = zSig64; + if ( 0 <= (sbits32) ( zSig<<1 ) ) { + zSig <<= 1; + --zExp; + } + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the single-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
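+| Dividing a nonzero finite value by zero raises the divide-by-zero
+| exception and returns a correctly signed infinity, while 0/0 and
+| infinity/infinity raise the invalid exception and return the default NaN;
+| the quotient itself comes from a 64-by-32 bit integer division, followed
+| by a sticky-bit fixup so that an inexact quotient is never mistaken for an
+| exact one.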
+*----------------------------------------------------------------------------*/ + +float32 float32_div( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return packFloat32( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloat32( zSign, 0xFF, 0 ); + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x7D; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + if ( bSig <= ( aSig + aSig ) ) { + aSig >>= 1; + ++zExp; + } + zSig = ( ( (bits64) aSig )<<32 ) / bSig; + if ( ( zSig & 0x3F ) == 0 ) { + zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); + } + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the single-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
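+| This is the IEEE remainder rather than a truncating `fmod': the quotient
+| a/b is rounded to the nearest integer, with ties going to the even
+| quotient, so the result may be negative even when both operands are
+| positive and its magnitude never exceeds half of |b|.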
+*----------------------------------------------------------------------------*/ + +float32 float32_rem( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, zSign; + int16 aExp, bExp, expDiff; + bits32 aSig, bSig; + bits32 q; + bits64 aSig64, bSig64, q64; + bits32 alternateASig; + sbits32 sigMean; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return a; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + expDiff = aExp - bExp; + aSig |= 0x00800000; + bSig |= 0x00800000; + if ( expDiff < 32 ) { + aSig <<= 8; + bSig <<= 8; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + aSig >>= 1; + } + q = ( bSig <= aSig ); + if ( q ) aSig -= bSig; + if ( 0 < expDiff ) { + q = ( ( (bits64) aSig )<<32 ) / bSig; + q >>= 32 - expDiff; + bSig >>= 2; + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; + } + else { + aSig >>= 2; + bSig >>= 2; + } + } + else { + if ( bSig <= aSig ) aSig -= bSig; + aSig64 = ( (bits64) aSig )<<40; + bSig64 = ( (bits64) bSig )<<40; + expDiff -= 64; + while ( 0 < expDiff ) { + q64 = estimateDiv128To64( aSig64, 0, bSig64 ); + q64 = ( 2 < q64 ) ? q64 - 2 : 0; + aSig64 = - ( ( bSig * q64 )<<38 ); + expDiff -= 62; + } + expDiff += 64; + q64 = estimateDiv128To64( aSig64, 0, bSig64 ); + q64 = ( 2 < q64 ) ? q64 - 2 : 0; + q = q64>>( 64 - expDiff ); + bSig <<= 6; + aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; + } + do { + alternateASig = aSig; + ++q; + aSig -= bSig; + } while ( 0 <= (sbits32) aSig ); + sigMean = aSig + alternateASig; + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { + aSig = alternateASig; + } + zSign = ( (sbits32) aSig < 0 ); + if ( zSign ) aSig = - aSig; + return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the single-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sqrt( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, zExp; + bits32 aSig, zSig; + bits64 rem, term; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); + if ( ! 
aSign ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aSign ) { + if ( ( aExp | aSig ) == 0 ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return float32_zero; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; + aSig = ( aSig | 0x00800000 )<<8; + zSig = estimateSqrt32( aExp, aSig ) + 2; + if ( ( zSig & 0x7F ) <= 5 ) { + if ( zSig < 2 ) { + zSig = 0x7FFFFFFF; + goto roundAndPack; + } + aSig >>= aExp & 1; + term = ( (bits64) zSig ) * zSig; + rem = ( ( (bits64) aSig )<<32 ) - term; + while ( (sbits64) rem < 0 ) { + --zSig; + rem += ( ( (bits64) zSig )<<1 ) | 1; + } + zSig |= ( rem != 0 ); + } + shift32RightJamming( zSig, 1, &zSig ); + roundAndPack: + return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the binary exponential of the single-precision floating-point value +| `a'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +| +| Uses the following identities: +| +| 1. ------------------------------------------------------------------------- +| x x*ln(2) +| 2 = e +| +| 2. ------------------------------------------------------------------------- +| 2 3 4 5 n +| x x x x x x x +| e = 1 + --- + --- + --- + --- + --- + ... + --- + ... +| 1! 2! 3! 4! 5! n! +*----------------------------------------------------------------------------*/ + +static const float64 float32_exp2_coefficients[15] = +{ + make_float64( 0x3ff0000000000000ll ), /* 1 */ + make_float64( 0x3fe0000000000000ll ), /* 2 */ + make_float64( 0x3fc5555555555555ll ), /* 3 */ + make_float64( 0x3fa5555555555555ll ), /* 4 */ + make_float64( 0x3f81111111111111ll ), /* 5 */ + make_float64( 0x3f56c16c16c16c17ll ), /* 6 */ + make_float64( 0x3f2a01a01a01a01all ), /* 7 */ + make_float64( 0x3efa01a01a01a01all ), /* 8 */ + make_float64( 0x3ec71de3a556c734ll ), /* 9 */ + make_float64( 0x3e927e4fb7789f5cll ), /* 10 */ + make_float64( 0x3e5ae64567f544e4ll ), /* 11 */ + make_float64( 0x3e21eed8eff8d898ll ), /* 12 */ + make_float64( 0x3de6124613a86d09ll ), /* 13 */ + make_float64( 0x3da93974a8c07c9dll ), /* 14 */ + make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ +}; + +float32 float32_exp2( float32 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + float64 r, x, xn; + int i; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + + if ( aExp == 0xFF) { + if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); + return (aSign) ? float32_zero : a; + } + if (aExp == 0) { + if (aSig == 0) return float32_one; + } + + float_raise( float_flag_inexact STATUS_VAR); + + /* ******************************* */ + /* using float64 for approximation */ + /* ******************************* */ + x = float32_to_float64(a STATUS_VAR); + x = float64_mul(x, float64_ln2 STATUS_VAR); + + xn = x; + r = float64_one; + for (i = 0 ; i < 15 ; i++) { + float64 f; + + f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR); + r = float64_add(r, f STATUS_VAR); + + xn = float64_mul(xn, x STATUS_VAR); + } + + return float64_to_float32(r, status); +} + +/*---------------------------------------------------------------------------- +| Returns the binary log of the single-precision floating-point value `a'. 
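+| (It is the inverse of float32_exp2 above, whose 15 series coefficients are
+| the float64 encodings of 1/n! for n = 1..15; for instance
+| 0x3fc5555555555555 encodes 1/6.  As a rough sanity check, not code taken
+| from this file, float32_log2( float32_exp2( x STATUS_VAR ) STATUS_VAR )
+| should reproduce x up to rounding error whenever no overflow or underflow
+| occurs.)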
+| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ +float32 float32_log2( float32 a STATUS_PARAM ) +{ + flag aSign, zSign; + int16 aExp; + bits32 aSig, zSig, i; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + if ( aSign ) { + float_raise( float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); + return a; + } + + aExp -= 0x7F; + aSig |= 0x00800000; + zSign = aExp < 0; + zSig = aExp << 23; + + for (i = 1 << 22; i > 0; i >>= 1) { + aSig = ( (bits64)aSig * aSig ) >> 23; + if ( aSig & 0x01000000 ) { + aSig >>= 1; + zSig |= i; + } + } + + if ( zSign ) + zSig = -zSig; + + return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_eq( float32 a, float32 b STATUS_PARAM ) +{ + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + return ( float32_val(a) == float32_val(b) ) || + ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_le( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits32 av, bv; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
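+|
+| A note on the pattern shared by float32_eq, float32_le above and the
+| remaining comparison predicates below: any NaN operand makes the predicate
+| return 0 (the ordered comparisons raise float_flag_invalid for any NaN,
+| the _quiet variants only for signaling NaNs), the expression
+| ( ( av | bv )<<1 ) == 0 treats +0 and -0 as equal, and for operands of the
+| same sign the raw bit patterns order correctly once the sign is folded in
+| via aSign ^ ( av < bv ).  An illustrative caller-side minimum helper (a
+| hypothetical routine, not one defined in this file):
+|
+|     static float32 example_min32( float32 a, float32 b STATUS_PARAM )
+|     {
+|         return float32_lt( b, a STATUS_VAR ) ? b : a;
+|     }
+|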
+*----------------------------------------------------------------------------*/ + +int float32_lt( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits32 av, bv; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_eq_signaling( float32 a, float32 b STATUS_PARAM ) +{ + bits32 av, bv; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + av = float32_val(a); + bv = float32_val(b); + return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits32 av, bv; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits32 av, bv; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + av = float32_val(a); + bv = float32_val(b); + if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits64 aSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; + if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x42C - aExp; + if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); + return roundAndPackInt32( aSign, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits64 aSig, savedASig; + int32 z; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( 0x41E < aExp ) { + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FF ) { + if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x433 - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = aSig; + if ( aSign ) z = - z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + return aSign ? 
(sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig<<shiftCount ) != savedASig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 float64_to_int64( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits64 aSig, aSigExtra; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x433 - aExp; + if ( shiftCount <= 0 ) { + if ( 0x43E < aExp ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FF ) + && ( aSig != LIT64( 0x0010000000000000 ) ) ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + aSigExtra = 0; + aSig <<= - shiftCount; + } + else { + shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); + } + return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, shiftCount; + bits64 aSig; + int64 z; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); + shiftCount = aExp - 0x433; + if ( 0 <= shiftCount ) { + if ( 0x43E <= aExp ) { + if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FF ) + && ( aSig != LIT64( 0x0010000000000000 ) ) ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + z = aSig<<shiftCount; + } + else { + if ( aExp < 0x3FE ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + z = aSig>>( - shiftCount ); + if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the single-precision floating-point format. 
The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float64_to_float32( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + bits32 zSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) ); + return packFloat32( aSign, 0xFF, 0 ); + } + shift64RightJamming( aSig, 22, &aSig ); + zSig = aSig; + if ( aExp || zSig ) { + zSig |= 0x40000000; + aExp -= 0x381; + } + return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); + +} + + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| half-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ +static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig) +{ + return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig; +} + +/* Half precision floats come in two formats: standard IEEE and "ARM" format. + The latter gains extra exponent range by omitting the NaN/Inf encodings. */ + +float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSign = a >> 15; + aExp = (a >> 10) & 0x1f; + aSig = a & 0x3ff; + + if (aExp == 0x1f && ieee) { + if (aSig) { + /* Make sure correct exceptions are raised. */ + float32ToCommonNaN(a STATUS_VAR); + aSig |= 0x200; + } + return packFloat32(aSign, 0xff, aSig << 13); + } + if (aExp == 0) { + int8 shiftCount; + + if (aSig == 0) { + return packFloat32(aSign, 0, 0); + } + + shiftCount = countLeadingZeros32( aSig ) - 21; + aSig = aSig << shiftCount; + aExp = -shiftCount; + } + return packFloat32( aSign, aExp + 0x70, aSig << 13); +} + +bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM) +{ + flag aSign; + int16 aExp; + bits32 aSig; + bits32 mask; + bits32 increment; + int8 roundingMode; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if (aSig) { + /* Make sure correct exceptions are raised. */ + float32ToCommonNaN(a STATUS_VAR); + aSig |= 0x00400000; + } + return packFloat16(aSign, 0x1f, aSig >> 13); + } + if (aExp == 0 && aSign == 0) { + return packFloat16(aSign, 0, 0); + } + /* Decimal point between bits 22 and 23. */ + aSig |= 0x00800000; + aExp -= 0x7f; + if (aExp < -14) { + mask = 0x007fffff; + if (aExp < -24) { + aExp = -25; + } else { + mask >>= 24 + aExp; + } + } else { + mask = 0x00001fff; + } + if (aSig & mask) { + float_raise( float_flag_underflow STATUS_VAR ); + roundingMode = STATUS(float_rounding_mode); + switch (roundingMode) { + case float_round_nearest_even: + increment = (mask + 1) >> 1; + if ((aSig & mask) == increment) { + increment = aSig & (increment << 1); + } + break; + case float_round_up: + increment = aSign ? 
0 : mask; + break; + case float_round_down: + increment = aSign ? mask : 0; + break; + default: /* round_to_zero */ + increment = 0; + break; + } + aSig += increment; + if (aSig >= 0x01000000) { + aSig >>= 1; + aExp++; + } + } else if (aExp < -14 + && STATUS(float_detect_tininess) == float_tininess_before_rounding) { + float_raise( float_flag_underflow STATUS_VAR); + } + + if (ieee) { + if (aExp > 15) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat16(aSign, 0x1f, 0); + } + } else { + if (aExp > 16) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat16(aSign, 0x1f, 0x3ff); + } + } + if (aExp < -24) { + return packFloat16(aSign, 0, 0); + } + if (aExp < -14) { + aSig >>= -14 - aExp; + aExp = -14; + } + return packFloat16(aSign, aExp + 14, aSig >> 13); +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + return + packFloatx80( + aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); + +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the quadruple-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float64_to_float128( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits64 aSig, zSig0, zSig1; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) ); + return packFloat128( aSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); + return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Rounds the double-precision floating-point value `a' to an integer, and +| returns the result as a double-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
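+|
+| The float64_trunc_to_int wrapper below temporarily overrides the rounding
+| mode in the status block around a call to this routine; the same pattern
+| gives the other directed roundings.  An illustrative floor-style sketch
+| along those lines (a hypothetical helper, not part of this file):
+|
+|     static float64 example_floor64( float64 a STATUS_PARAM )
+|     {
+|         int oldmode;
+|         float64 res;
+|         oldmode = STATUS(float_rounding_mode);
+|         STATUS(float_rounding_mode) = float_round_down;
+|         res = float64_round_to_int( a STATUS_VAR );
+|         STATUS(float_rounding_mode) = oldmode;
+|         return res;
+|     }
+|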
+*----------------------------------------------------------------------------*/ + +float64 float64_round_to_int( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits64 lastBitMask, roundBitsMask; + int8 roundingMode; + bits64 z; + + aExp = extractFloat64Exp( a ); + if ( 0x433 <= aExp ) { + if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { + return propagateFloat64NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp < 0x3FF ) { + if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a; + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloat64Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { + return packFloat64( aSign, 0x3FF, 0 ); + } + break; + case float_round_down: + return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); + case float_round_up: + return make_float64( + aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); + } + return packFloat64( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x433 - aExp; + roundBitsMask = lastBitMask - 1; + z = float64_val(a); + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z += lastBitMask>>1; + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { + z += roundBitsMask; + } + } + z &= ~ roundBitsMask; + if ( z != float64_val(a) ) + STATUS(float_exception_flags) |= float_flag_inexact; + return make_float64(z); + +} + +float64 float64_trunc_to_int( float64 a STATUS_PARAM) +{ + int oldmode; + float64 res; + oldmode = STATUS(float_rounding_mode); + STATUS(float_rounding_mode) = float_round_to_zero; + res = float64_round_to_int(a STATUS_VAR); + STATUS(float_rounding_mode) = oldmode; + return res; +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the double-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
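+|
+| Rounding, when needed, happens once in roundAndPackFloat64 at the end of
+| this helper, which is also where float_flag_inexact gets set.  Since the
+| flags in the status block only accumulate, a caller can test whether a
+| particular addition was exact with a sketch like the following (a
+| hypothetical helper, not a routine from this file):
+|
+|     static flag example_add_was_inexact( float64 a, float64 b STATUS_PARAM )
+|     {
+|         STATUS(float_exception_flags) &= ~float_flag_inexact;
+|         float64_add( a, b STATUS_VAR );
+|         return ( STATUS(float_exception_flags) & float_flag_inexact ) != 0;
+|     }
+|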
+*----------------------------------------------------------------------------*/ + +static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) +{ + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + aSig <<= 9; + bSig <<= 9; + if ( 0 < expDiff ) { + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= LIT64( 0x2000000000000000 ); + } + shift64RightJamming( bSig, expDiff, &bSig ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= LIT64( 0x2000000000000000 ); + } + shift64RightJamming( aSig, - expDiff, &aSig ); + zExp = bExp; + } + else { + if ( aExp == 0x7FF ) { + if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( aExp == 0 ) { + if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); + return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); + } + zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; + zExp = aExp; + goto roundAndPack; + } + aSig |= LIT64( 0x2000000000000000 ); + zSig = ( aSig + bSig )<<1; + --zExp; + if ( (sbits64) zSig < 0 ) { + zSig = aSig + bSig; + ++zExp; + } + roundAndPack: + return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. 
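+|
+| One consequence of the equal-significand case below is the standard zero
+| convention: for finite, equal operands the difference is +0 in every
+| rounding mode except float_round_down, where it is -0.  An illustrative
+| way to observe this from the public entry points (not a routine defined in
+| this file):
+|
+|     static float64 example_cancel64( float64 a STATUS_PARAM )
+|     {
+|         return float64_sub( a, a STATUS_VAR );
+|     }
+|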
+*----------------------------------------------------------------------------*/ + +static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) +{ + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + aSig <<= 10; + bSig <<= 10; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FF ) { + if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return packFloat64( zSign ^ 1, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= LIT64( 0x4000000000000000 ); + } + shift64RightJamming( aSig, - expDiff, &aSig ); + bSig |= LIT64( 0x4000000000000000 ); + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= LIT64( 0x4000000000000000 ); + } + shift64RightJamming( bSig, expDiff, &bSig ); + aSig |= LIT64( 0x4000000000000000 ); + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the double-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_add( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return addFloat64Sigs( a, b, aSign STATUS_VAR ); + } + else { + return subFloat64Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sub( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return subFloat64Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloat64Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +float64 float64_mul( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { + return propagateFloat64NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x3FF; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + mul64To128( aSig, bSig, &zSig0, &zSig1 ); + zSig0 |= ( zSig1 != 0 ); + if ( 0 <= (sbits64) ( zSig0<<1 ) ) { + zSig0 <<= 1; + --zExp; + } + return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the double-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
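+|
+| For callers that only need a reciprocal, the constant 1.0 can be built
+| with the same encoding already used by float64_round_to_int above,
+| LIT64( 0x3FF0000000000000 ).  An illustrative sketch (a hypothetical
+| helper, not a routine defined in this file):
+|
+|     static float64 example_recip64( float64 a STATUS_PARAM )
+|     {
+|         return float64_div( make_float64( LIT64( 0x3FF0000000000000 ) ),
+|                             a STATUS_VAR );
+|     }
+|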
+*----------------------------------------------------------------------------*/ + +float64 float64_div( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig; + bits64 rem0, rem1; + bits64 term0, term1; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return packFloat64( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloat64( zSign, 0x7FF, 0 ); + } + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x3FD; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + if ( bSig <= ( aSig + aSig ) ) { + aSig >>= 1; + ++zExp; + } + zSig = estimateDiv128To64( aSig, 0, bSig ); + if ( ( zSig & 0x1FF ) <= 2 ) { + mul64To128( bSig, zSig, &term0, &term1 ); + sub128( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig; + add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); + } + zSig |= ( rem1 != 0 ); + } + return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the double-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +float64 float64_rem( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, zSign; + int16 aExp, bExp, expDiff; + bits64 aSig, bSig; + bits64 q, alternateASig; + sbits64 sigMean; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + if ( aExp == 0x7FF ) { + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { + return propagateFloat64NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return a; + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + expDiff = aExp - bExp; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + aSig >>= 1; + } + q = ( bSig <= aSig ); + if ( q ) aSig -= bSig; + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + aSig = - ( ( bSig>>2 ) * q ); + expDiff -= 62; + } + expDiff += 64; + if ( 0 < expDiff ) { + q = estimateDiv128To64( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + q >>= 64 - expDiff; + bSig >>= 2; + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; + } + else { + aSig >>= 2; + bSig >>= 2; + } + do { + alternateASig = aSig; + ++q; + aSig -= bSig; + } while ( 0 <= (sbits64) aSig ); + sigMean = aSig + alternateASig; + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { + aSig = alternateASig; + } + zSign = ( (sbits64) aSig < 0 ); + if ( zSign ) aSig = - aSig; + return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the double-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sqrt( float64 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp, zExp; + bits64 aSig, zSig, doubleZSig; + bits64 rem0, rem1, term0, term1; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); + if ( ! 
aSign ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aSign ) { + if ( ( aExp | aSig ) == 0 ) return a; + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return float64_zero; + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; + aSig |= LIT64( 0x0010000000000000 ); + zSig = estimateSqrt32( aExp, aSig>>21 ); + aSig <<= 9 - ( aExp & 1 ); + zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); + if ( ( zSig & 0x1FF ) <= 5 ) { + doubleZSig = zSig<<1; + mul64To128( zSig, zSig, &term0, &term1 ); + sub128( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig; + doubleZSig -= 2; + add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); + } + zSig |= ( ( rem0 | rem1 ) != 0 ); + } + return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the binary log of the double-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ +float64 float64_log2( float64 a STATUS_PARAM ) +{ + flag aSign, zSign; + int16 aExp; + bits64 aSig, aSig0, aSig1, zSig, i; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + if ( aSign ) { + float_raise( float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); + return a; + } + + aExp -= 0x3FF; + aSig |= LIT64( 0x0010000000000000 ); + zSign = aExp < 0; + zSig = (bits64)aExp << 52; + for (i = 1LL << 51; i > 0; i >>= 1) { + mul64To128( aSig, aSig, &aSig0, &aSig1 ); + aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); + if ( aSig & LIT64( 0x0020000000000000 ) ) { + aSig >>= 1; + zSig |= i; + } + } + + if ( zSign ) + zSig = -zSig; + return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to the +| corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_eq( float64 a, float64 b STATUS_PARAM ) +{ + bits64 av, bv; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + av = float64_val(a); + bv = float64_val(b); + return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. The comparison is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. 
+*----------------------------------------------------------------------------*/ + +int float64_le( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits64 av, bv; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_lt( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits64 av, bv; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to the +| corresponding value `b', and 0 otherwise. The invalid exception is raised +| if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_eq_signaling( float64 a, float64 b STATUS_PARAM ) +{ + bits64 av, bv; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + av = float64_val(a); + bv = float64_val(b); + return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits64 av, bv; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); + return ( av == bv ) || ( aSign ^ ( av < bv ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) +{ + flag aSign, bSign; + bits64 av, bv; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + av = float64_val(a); + bv = float64_val(b); + if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); + return ( av != bv ) && ( aSign ^ ( av < bv ) ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 32-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic---which means in particular that the conversion +| is rounded according to the current rounding mode. If `a' is a NaN, the +| largest positive integer is returned. Otherwise, if the conversion +| overflows, the largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; + shiftCount = 0x4037 - aExp; + if ( shiftCount <= 0 ) shiftCount = 1; + shift64RightJamming( aSig, shiftCount, &aSig ); + return roundAndPackInt32( aSign, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 32-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic, except that the conversion is always rounded +| toward zero. If `a' is a NaN, the largest positive integer is returned. +| Otherwise, if the conversion overflows, the largest integer with the same +| sign as `a' is returned. 
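+|
+| As with the float64 conversions earlier in this file, the saturation
+| convention is: a NaN input yields the largest positive integer
+| (0x7FFFFFFF) with float_flag_invalid raised, and an out-of-range input
+| saturates with the sign of the operand.  An illustrative caller-side
+| sketch (a hypothetical helper, not part of this file):
+|
+|     static int32 example_truncating_cast32( floatx80 a STATUS_PARAM )
+|     {
+|         return floatx80_to_int32_round_to_zero( a STATUS_VAR );
+|     }
+|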
+*----------------------------------------------------------------------------*/ + +int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig, savedASig; + int32 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( 0x401E < aExp ) { + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FFF ) { + if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + shiftCount = 0x403E - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = aSig; + if ( aSign ) z = - z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig<<shiftCount ) != savedASig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 64-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic---which means in particular that the conversion +| is rounded according to the current rounding mode. If `a' is a NaN, +| the largest positive integer is returned. Otherwise, if the conversion +| overflows, the largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig, aSigExtra; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + shiftCount = 0x403E - aExp; + if ( shiftCount <= 0 ) { + if ( shiftCount ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FFF ) + && ( aSig != LIT64( 0x8000000000000000 ) ) ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + aSigExtra = 0; + } + else { + shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); + } + return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 64-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic, except that the conversion is always rounded +| toward zero. If `a' is a NaN, the largest positive integer is returned. +| Otherwise, if the conversion overflows, the largest integer with the same +| sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig; + int64 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + shiftCount = aExp - 0x403E; + if ( 0 <= shiftCount ) { + aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); + if ( ( a.high != 0xC03E ) || aSig ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! 
aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + else if ( aExp < 0x3FFF ) { + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + z = aSig>>( - shiftCount ); + if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the single-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) { + return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) ); + } + return packFloat32( aSign, 0xFF, 0 ); + } + shift64RightJamming( aSig, 33, &aSig ); + if ( aExp || aSig ) aExp -= 0x3F81; + return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 aSig, zSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) { + return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) ); + } + return packFloat64( aSign, 0x7FF, 0 ); + } + shift64RightJamming( aSig, 1, &zSig ); + if ( aExp || aSig ) aExp -= 0x3C01; + return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); + +} + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the quadruple-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits64 aSig, zSig0, zSig1; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { + return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) ); + } + shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); + return packFloat128( aSign, aExp, zSig0, zSig1 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Rounds the extended double-precision floating-point value `a' to an integer, +| and returns the result as an extended quadruple-precision floating-point +| value. 
The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 lastBitMask, roundBitsMask; + int8 roundingMode; + floatx80 z; + + aExp = extractFloatx80Exp( a ); + if ( 0x403E <= aExp ) { + if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { + return propagateFloatx80NaN( a, a STATUS_VAR ); + } + return a; + } + if ( aExp < 0x3FFF ) { + if ( ( aExp == 0 ) + && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { + return a; + } + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloatx80Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) + ) { + return + packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); + } + break; + case float_round_down: + return + aSign ? + packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) + : packFloatx80( 0, 0, 0 ); + case float_round_up: + return + aSign ? packFloatx80( 1, 0, 0 ) + : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); + } + return packFloatx80( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x403E - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z.low += lastBitMask>>1; + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { + z.low += roundBitsMask; + } + } + z.low &= ~ roundBitsMask; + if ( z.low == 0 ) { + ++z.high; + z.low = LIT64( 0x8000000000000000 ); + } + if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the extended double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is +| negated before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
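+|
+| Note that roundAndPackFloatx80 below is passed
+| STATUS(floatx80_rounding_precision), so results can be rounded as if to
+| 64-bit or 32-bit precision instead of the full 80-bit format, much like
+| the x87 precision-control field.  An illustrative sketch of forcing
+| double-precision rounding around an add (a hypothetical helper, not part
+| of this file):
+|
+|     static floatx80 example_add_pc64( floatx80 a, floatx80 b STATUS_PARAM )
+|     {
+|         int8 oldprec;
+|         floatx80 res;
+|         oldprec = STATUS(floatx80_rounding_precision);
+|         STATUS(floatx80_rounding_precision) = 64;
+|         res = floatx80_add( a, b STATUS_VAR );
+|         STATUS(floatx80_rounding_precision) = oldprec;
+|         return res;
+|     }
+|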
+*----------------------------------------------------------------------------*/ + +static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) +{ + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + int32 expDiff; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) { + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) --expDiff; + shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) ++expDiff; + shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); + zExp = bExp; + } + else { + if ( aExp == 0x7FFF ) { + if ( (bits64) ( ( aSig | bSig )<<1 ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + return a; + } + zSig1 = 0; + zSig0 = aSig + bSig; + if ( aExp == 0 ) { + normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); + goto roundAndPack; + } + zExp = aExp; + goto shiftRight1; + } + zSig0 = aSig + bSig; + if ( (sbits64) zSig0 < 0 ) goto roundAndPack; + shiftRight1: + shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); + zSig0 |= LIT64( 0x8000000000000000 ); + ++zExp; + roundAndPack: + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the extended +| double-precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) +{ + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + int32 expDiff; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FFF ) { + if ( (bits64) ( ( aSig | bSig )<<1 ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + zSig1 = 0; + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) ++expDiff; + shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); + bBigger: + sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) --expDiff; + shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); + aBigger: + sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); + zExp = aExp; + normalizeRoundAndPack: + return + normalizeRoundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the extended double-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign == bSign ) { + return addFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + else { + return subFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign == bSign ) { + return subFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloatx80Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
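+| Multiplying an infinity by zero raises the invalid exception and returns
+| the default NaN.  Illustrative call, assuming a build in which
+| STATUS_PARAM supplies a `float_status *status' argument (the variable
+| `status' below is hypothetical):
+|     floatx80 p = floatx80_mul( a, b, &status );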
+*----------------------------------------------------------------------------*/ + +floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + bSign = extractFloatx80Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) + || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig ) == 0 ) goto invalid; + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x3FFE; + mul64To128( aSig, bSig, &zSig0, &zSig1 ); + if ( 0 < (sbits64) zSig0 ) { + shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); + --zExp; + } + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the extended double-precision floating-point +| value `a' by the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
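+| Dividing a finite nonzero value by zero raises the divide-by-zero
+| exception and returns a correctly signed infinity; 0/0 and inf/inf raise
+| the invalid exception and return the default NaN.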
+*----------------------------------------------------------------------------*/ + +floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + bits64 rem0, rem1, rem2, term0, term1, term2; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + bSign = extractFloatx80Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + goto invalid; + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return packFloatx80( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x3FFE; + rem1 = 0; + if ( bSig <= aSig ) { + shift128Right( aSig, 0, 1, &aSig, &rem1 ); + ++zExp; + } + zSig0 = estimateDiv128To64( aSig, rem1, bSig ); + mul64To128( bSig, zSig0, &term0, &term1 ); + sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, bSig ); + if ( (bits64) ( zSig1<<1 ) <= 8 ) { + mul64To128( bSig, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); + } + zSig1 |= ( ( rem1 | rem2 ) != 0 ); + } + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the extended double-precision floating-point value +| `a' with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
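+| The quotient used is the integer nearest to a/b (ties going to the even
+| integer), so the magnitude of the returned remainder never exceeds half
+| the magnitude of `b'.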
+*----------------------------------------------------------------------------*/ + +floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, zSign; + int32 aExp, bExp, expDiff; + bits64 aSig0, aSig1, bSig; + bits64 q, term0, term1, alternateASig0, alternateASig1; + floatx80 z; + + aSig0 = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig0<<1 ) + || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { + return propagateFloatx80NaN( a, b STATUS_VAR ); + } + goto invalid; + } + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( (bits64) ( aSig0<<1 ) == 0 ) return a; + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); + } + bSig |= LIT64( 0x8000000000000000 ); + zSign = aSign; + expDiff = aExp - bExp; + aSig1 = 0; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); + expDiff = 0; + } + q = ( bSig <= aSig0 ); + if ( q ) aSig0 -= bSig; + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + mul64To128( bSig, q, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); + expDiff -= 62; + } + expDiff += 64; + if ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + q >>= 64 - expDiff; + mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); + while ( le128( term0, term1, aSig0, aSig1 ) ) { + ++q; + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + } + } + else { + term1 = 0; + term0 = bSig; + } + sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); + if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) + || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) + && ( q & 1 ) ) + ) { + aSig0 = alternateASig0; + aSig1 = alternateASig1; + zSign = ! zSign; + } + return + normalizeRoundAndPackFloatx80( + 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the extended double-precision floating-point +| value `a'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, zExp; + bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + floatx80 z; + + aSig0 = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); + if ( ! 
aSign ) return a; + goto invalid; + } + if ( aSign ) { + if ( ( aExp | aSig0 ) == 0 ) return a; + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + if ( aExp == 0 ) { + if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); + } + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; + zSig0 = estimateSqrt32( aExp, aSig0>>32 ); + shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); + doubleZSig0 = zSig0<<1; + mul64To128( zSig0, zSig0, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + doubleZSig0 -= 2; + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); + if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { + if ( zSig1 == 0 ) zSig1 = 1; + mul64To128( doubleZSig0, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + mul64To128( zSig1, zSig1, &term2, &term3 ); + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + shortShift128Left( 0, zSig1, 1, &term2, &term3 ); + term3 |= 1; + term2 |= doubleZSig0; + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); + zSig0 |= doubleZSig0; + return + roundAndPackFloatx80( + STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| equal to the corresponding value `b', and 0 otherwise. The comparison is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) +{ + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| less than or equal to the corresponding value `b', and 0 otherwise. The +| comparison is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
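+| If either operand is a NaN, even a quiet one, the invalid exception is
+| raised and 0 is returned; floatx80_le_quiet below avoids the exception
+| for quiet NaNs.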
+*----------------------------------------------------------------------------*/ + +int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| less than the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is equal +| to the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM ) +{ + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is less +| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs +| do not cause an exception. Otherwise, the comparison is performed according +| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is less +| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause +| an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 32-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float128_to_int32( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); + aSig0 |= ( aSig1 != 0 ); + shiftCount = 0x4028 - aExp; + if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); + return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 32-bit two's complement integer format. 
The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. If +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the +| conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig0, aSig1, savedASig; + int32 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + aSig0 |= ( aSig1 != 0 ); + if ( 0x401E < aExp ) { + if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FFF ) { + if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; + return 0; + } + aSig0 |= LIT64( 0x0001000000000000 ); + shiftCount = 0x402F - aExp; + savedASig = aSig0; + aSig0 >>= shiftCount; + z = aSig0; + if ( aSign ) z = - z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig0<<shiftCount ) != savedASig ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 64-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 float128_to_int64( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); + shiftCount = 0x402F - aExp; + if ( shiftCount <= 0 ) { + if ( 0x403E < aExp ) { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign + || ( ( aExp == 0x7FFF ) + && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) + ) + ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); + } + else { + shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); + } + return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 64-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. 
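+| On overflow the invalid exception is raised and the result saturates to
+| 0x7FFFFFFFFFFFFFFF or 0x8000000000000000 according to the sign of `a'.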
+*----------------------------------------------------------------------------*/ + +int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig0, aSig1; + int64 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); + shiftCount = aExp - 0x402F; + if ( 0 < shiftCount ) { + if ( 0x403E <= aExp ) { + aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); + if ( ( a.high == LIT64( 0xC03E000000000000 ) ) + && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { + if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; + } + else { + float_raise( float_flag_invalid STATUS_VAR); + if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); + if ( (bits64) ( aSig1<<shiftCount ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + } + else { + if ( aExp < 0x3FFF ) { + if ( aExp | aSig0 | aSig1 ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return 0; + } + z = aSig0>>( - shiftCount ); + if ( aSig1 + || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the single-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float128_to_float32( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 aSig0, aSig1; + bits32 zSig; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) ); + } + return packFloat32( aSign, 0xFF, 0 ); + } + aSig0 |= ( aSig1 != 0 ); + shift64RightJamming( aSig0, 18, &aSig0 ); + zSig = aSig0; + if ( aExp || zSig ) { + zSig |= 0x40000000; + aExp -= 0x3F81; + } + return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. 
+*----------------------------------------------------------------------------*/ + +float64 float128_to_float64( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) ); + } + return packFloat64( aSign, 0x7FF, 0 ); + } + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); + aSig0 |= ( aSig1 != 0 ); + if ( aExp || aSig0 ) { + aSig0 |= LIT64( 0x4000000000000000 ); + aExp -= 0x3C01; + } + return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the extended double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) ); + } + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + else { + aSig0 |= LIT64( 0x0001000000000000 ); + } + shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); + return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Rounds the quadruple-precision floating-point value `a' to an integer, and +| returns the result as a quadruple-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
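+| The inexact exception is raised whenever the returned value differs from
+| the operand.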
+*----------------------------------------------------------------------------*/ + +float128 float128_round_to_int( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 lastBitMask, roundBitsMask; + int8 roundingMode; + float128 z; + + aExp = extractFloat128Exp( a ); + if ( 0x402F <= aExp ) { + if ( 0x406F <= aExp ) { + if ( ( aExp == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) + ) { + return propagateFloat128NaN( a, a STATUS_VAR ); + } + return a; + } + lastBitMask = 1; + lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + if ( lastBitMask ) { + add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; + } + else { + if ( (sbits64) z.low < 0 ) { + ++z.high; + if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; + } + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat128Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); + } + } + z.low &= ~ roundBitsMask; + } + else { + if ( aExp < 0x3FFF ) { + if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; + STATUS(float_exception_flags) |= float_flag_inexact; + aSign = extractFloat128Sign( a ); + switch ( STATUS(float_rounding_mode) ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FFE ) + && ( extractFloat128Frac0( a ) + | extractFloat128Frac1( a ) ) + ) { + return packFloat128( aSign, 0x3FFF, 0, 0 ); + } + break; + case float_round_down: + return + aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) + : packFloat128( 0, 0, 0, 0 ); + case float_round_up: + return + aSign ? packFloat128( 1, 0, 0, 0 ) + : packFloat128( 0, 0x3FFF, 0, 0 ); + } + return packFloat128( aSign, 0, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x402F - aExp; + roundBitsMask = lastBitMask - 1; + z.low = 0; + z.high = a.high; + roundingMode = STATUS(float_rounding_mode); + if ( roundingMode == float_round_nearest_even ) { + z.high += lastBitMask>>1; + if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { + z.high &= ~ lastBitMask; + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat128Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + z.high |= ( a.low != 0 ); + z.high += roundBitsMask; + } + } + z.high &= ~ roundBitsMask; + } + if ( ( z.low != a.low ) || ( z.high != a.high ) ) { + STATUS(float_exception_flags) |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the quadruple-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) +{ + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + int32 expDiff; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) { + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= LIT64( 0x0001000000000000 ); + } + shift128ExtraRightJamming( + bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= LIT64( 0x0001000000000000 ); + } + shift128ExtraRightJamming( + aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); + zExp = bExp; + } + else { + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + return a; + } + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + if ( aExp == 0 ) { + if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); + return packFloat128( zSign, 0, zSig0, zSig1 ); + } + zSig2 = 0; + zSig0 |= LIT64( 0x0002000000000000 ); + zExp = aExp; + goto shiftRight1; + } + aSig0 |= LIT64( 0x0001000000000000 ); + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + --zExp; + if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; + ++zExp; + shiftRight1: + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + roundAndPack: + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the quadruple- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) +{ + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; + int32 expDiff; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + expDiff = aExp - bExp; + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); + shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig0 < aSig0 ) goto aBigger; + if ( aSig0 < bSig0 ) goto bBigger; + if ( bSig1 < aSig1 ) goto aBigger; + if ( aSig1 < bSig1 ) goto bBigger; + return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= LIT64( 0x4000000000000000 ); + } + shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + bSig0 |= LIT64( 0x4000000000000000 ); + bBigger: + sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= LIT64( 0x4000000000000000 ); + } + shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); + aSig0 |= LIT64( 0x4000000000000000 ); + aBigger: + sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the quadruple-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_add( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign == bSign ) { + return addFloat128Sigs( a, b, aSign STATUS_VAR ); + } + else { + return subFloat128Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +float128 float128_sub( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign == bSign ) { + return subFloat128Sigs( a, b, aSign STATUS_VAR ); + } + else { + return addFloat128Sigs( a, b, aSign STATUS_VAR ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_mul( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + bSign = extractFloat128Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + zExp = aExp + bExp - 0x4000; + aSig0 |= LIT64( 0x0001000000000000 ); + shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); + mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); + add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zSig2 |= ( zSig3 != 0 ); + if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + ++zExp; + } + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the quadruple-precision floating-point value +| `a' by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +float128 float128_div( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + bSign = extractFloat128Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + goto invalid; + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return packFloat128( zSign, 0, 0, 0 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + float_raise( float_flag_divbyzero STATUS_VAR); + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = aExp - bExp + 0x3FFD; + shortShift128Left( + aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); + shortShift128Left( + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); + if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { + shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); + ++zExp; + } + zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); + mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); + sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); + } + zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); + if ( ( zSig1 & 0x3FFF ) <= 4 ) { + mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); + sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the quadruple-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +float128 float128_rem( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, zSign; + int32 aExp, bExp, expDiff; + bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; + bits64 allZero, alternateASig0, alternateASig1, sigMean1; + sbits64 sigMean0; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + if ( aExp == 0x7FFF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat128NaN( a, b STATUS_VAR ); + } + goto invalid; + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); + return a; + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return a; + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + expDiff = aExp - bExp; + if ( expDiff < -1 ) return a; + shortShift128Left( + aSig0 | LIT64( 0x0001000000000000 ), + aSig1, + 15 - ( expDiff < 0 ), + &aSig0, + &aSig1 + ); + shortShift128Left( + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); + q = le128( bSig0, bSig1, aSig0, aSig1 ); + if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); + shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); + shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); + sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); + expDiff -= 61; + } + if ( -64 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + q >>= - expDiff; + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); + expDiff += 52; + if ( expDiff < 0 ) { + shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + } + else { + shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); + } + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); + sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); + } + else { + shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); + } + do { + alternateASig0 = aSig0; + alternateASig1 = aSig1; + ++q; + sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + } while ( 0 <= (sbits64) aSig0 ); + add128( + aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); + if ( ( sigMean0 < 0 ) + || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { + aSig0 = alternateASig0; + aSig1 = alternateASig1; + } + zSign = ( (sbits64) aSig0 < 0 ); + if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); + return + normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the quadruple-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. 
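+| The square root of a negative nonzero value raises the invalid exception
+| and returns the default NaN; the square root of minus zero is minus zero.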
+*----------------------------------------------------------------------------*/ + +float128 float128_sqrt( float128 a STATUS_PARAM ) +{ + flag aSign; + int32 aExp, zExp; + bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); + if ( ! aSign ) return a; + goto invalid; + } + if ( aSign ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; + invalid: + float_raise( float_flag_invalid STATUS_VAR); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; + aSig0 |= LIT64( 0x0001000000000000 ); + zSig0 = estimateSqrt32( aExp, aSig0>>17 ); + shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); + doubleZSig0 = zSig0<<1; + mul64To128( zSig0, zSig0, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + doubleZSig0 -= 2; + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); + if ( ( zSig1 & 0x1FFF ) <= 5 ) { + if ( zSig1 == 0 ) zSig1 = 1; + mul64To128( doubleZSig0, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + mul64To128( zSig1, zSig1, &term2, &term3 ); + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + shortShift128Left( 0, zSig1, 1, &term2, &term3 ); + term3 |= 1; + term2 |= doubleZSig0; + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_eq( float128 a, float128 b STATUS_PARAM ) +{ + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. 
+*----------------------------------------------------------------------------*/ + +int float128_le( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_lt( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_eq_signaling( float128 a, float128 b STATUS_PARAM ) +{ + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid STATUS_VAR); + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? 
lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +#endif + +/* misc functions */ +float32 uint32_to_float32( unsigned int a STATUS_PARAM ) +{ + return int64_to_float32(a STATUS_VAR); +} + +float64 uint32_to_float64( unsigned int a STATUS_PARAM ) +{ + return int64_to_float64(a STATUS_VAR); +} + +unsigned int float32_to_uint32( float32 a STATUS_PARAM ) +{ + int64_t v; + unsigned int res; + + v = float32_to_int64(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) +{ + int64_t v; + unsigned int res; + + v = float32_to_int64_round_to_zero(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +unsigned int float64_to_uint32( float64 a STATUS_PARAM ) +{ + int64_t v; + unsigned int res; + + v = float64_to_int64(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) +{ + int64_t v; + unsigned int res; + + v = float64_to_int64_round_to_zero(a STATUS_VAR); + if (v < 0) { + res = 0; + float_raise( float_flag_invalid STATUS_VAR); + } else if (v > 0xffffffff) { + res = 0xffffffff; + float_raise( float_flag_invalid STATUS_VAR); + } else { + res = v; + } + return res; +} + +/* FIXME: This looks broken. 
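+   The INT64_MIN bias appears to be added to the raw float64 bit pattern
+   (via float64_val) with integer arithmetic rather than being applied with
+   a floating-point addition, so the intermediate value is not the intended
+   a - 2^63 and the results and exception flags are unreliable.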
*/ +uint64_t float64_to_uint64 (float64 a STATUS_PARAM) +{ + int64_t v; + + v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); + v += float64_val(a); + v = float64_to_int64(make_float64(v) STATUS_VAR); + + return v - INT64_MIN; +} + +uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) +{ + int64_t v; + + v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); + v += float64_val(a); + v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); + + return v - INT64_MIN; +} + +#define COMPARE(s, nan_exp) \ +INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ + int is_quiet STATUS_PARAM ) \ +{ \ + flag aSign, bSign; \ + bits ## s av, bv; \ + \ + if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ + extractFloat ## s ## Frac( a ) ) || \ + ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ + extractFloat ## s ## Frac( b ) )) { \ + if (!is_quiet || \ + float ## s ## _is_signaling_nan( a ) || \ + float ## s ## _is_signaling_nan( b ) ) { \ + float_raise( float_flag_invalid STATUS_VAR); \ + } \ + return float_relation_unordered; \ + } \ + aSign = extractFloat ## s ## Sign( a ); \ + bSign = extractFloat ## s ## Sign( b ); \ + av = float ## s ## _val(a); \ + bv = float ## s ## _val(b); \ + if ( aSign != bSign ) { \ + if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \ + /* zero case */ \ + return float_relation_equal; \ + } else { \ + return 1 - (2 * aSign); \ + } \ + } else { \ + if (av == bv) { \ + return float_relation_equal; \ + } else { \ + return 1 - 2 * (aSign ^ ( av < bv )); \ + } \ + } \ +} \ + \ +int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ +{ \ + return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ +} \ + \ +int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ +{ \ + return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ +} + +COMPARE(32, 0xff) +COMPARE(64, 0x7ff) + +INLINE int float128_compare_internal( float128 a, float128 b, + int is_quiet STATUS_PARAM ) +{ + flag aSign, bSign; + + if (( ( extractFloat128Exp( a ) == 0x7fff ) && + ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || + ( ( extractFloat128Exp( b ) == 0x7fff ) && + ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { + if (!is_quiet || + float128_is_signaling_nan( a ) || + float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid STATUS_VAR); + } + return float_relation_unordered; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { + /* zero case */ + return float_relation_equal; + } else { + return 1 - (2 * aSign); + } + } else { + if (a.low == b.low && a.high == b.high) { + return float_relation_equal; + } else { + return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); + } + } +} + +int float128_compare( float128 a, float128 b STATUS_PARAM ) +{ + return float128_compare_internal(a, b, 0 STATUS_VAR); +} + +int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) +{ + return float128_compare_internal(a, b, 1 STATUS_VAR); +} + +/* Multiply A by 2 raised to the power N. 
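   Values with the maximum exponent (NaNs and infinities) and zeros are
   returned unchanged; for everything else the exponent is adjusted and the
   result re-normalised and rounded by the pack routine, so e.g. scaling
   single-precision 1.0 by n = 3 yields exactly 8.0, while an overflowing n
   is resolved by the rounding logic (typically an infinity together with the
   overflow and inexact flags).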
*/ +float32 float32_scalbn( float32 a, int n STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + + if ( aExp == 0xFF ) { + return a; + } + if ( aExp != 0 ) + aSig |= 0x00800000; + else if ( aSig == 0 ) + return a; + + aExp += n - 1; + aSig <<= 7; + return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); +} + +float64 float64_scalbn( float64 a, int n STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + + if ( aExp == 0x7FF ) { + return a; + } + if ( aExp != 0 ) + aSig |= LIT64( 0x0010000000000000 ); + else if ( aSig == 0 ) + return a; + + aExp += n - 1; + aSig <<= 10; + return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); +} + +#ifdef FLOATX80 +floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + + if ( aExp == 0x7FF ) { + return a; + } + if (aExp == 0 && aSig == 0) + return a; + + aExp += n; + return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), + aSign, aExp, aSig, 0 STATUS_VAR ); +} +#endif + +#ifdef FLOAT128 +float128 float128_scalbn( float128 a, int n STATUS_PARAM ) +{ + flag aSign; + int32 aExp; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + return a; + } + if ( aExp != 0 ) + aSig0 |= LIT64( 0x0001000000000000 ); + else if ( aSig0 == 0 && aSig1 == 0 ) + return a; + + aExp += n - 1; + return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 + STATUS_VAR ); + +} +#endif diff --git a/src/recompiler/fpu/softfloat.h b/src/recompiler/fpu/softfloat.h new file mode 100644 index 00000000..c209cd97 --- /dev/null +++ b/src/recompiler/fpu/softfloat.h @@ -0,0 +1,537 @@ +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 
+ +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#ifndef SOFTFLOAT_H +#define SOFTFLOAT_H + +#ifdef VBOX +#include <VBox/types.h> +#endif + +#if defined(CONFIG_SOLARIS) && defined(CONFIG_NEEDS_LIBSUNMATH) +#include <sunmath.h> +#endif + +#include <inttypes.h> +#include "config.h" + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines the most convenient type that holds +| integers of at least as many bits as specified. For example, `uint8' should +| be the most convenient type that can hold unsigned integers of as many as +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed +| to the same as `int'. +*----------------------------------------------------------------------------*/ +typedef uint8_t flag; +typedef uint8_t uint8; +typedef int8_t int8; +#ifndef _AIX +typedef int uint16; +typedef int int16; +#endif +typedef unsigned int uint32; +typedef signed int int32; +typedef uint64_t uint64; +typedef int64_t int64; + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines a type that holds integers +| of _exactly_ the number of bits specified. For instance, for most +| implementation of C, `bits16' and `sbits16' should be `typedef'ed to +| `unsigned short int' and `signed short int' (or `short int'), respectively. +*----------------------------------------------------------------------------*/ +typedef uint8_t bits8; +typedef int8_t sbits8; +typedef uint16_t bits16; +typedef int16_t sbits16; +typedef uint32_t bits32; +typedef int32_t sbits32; +typedef uint64_t bits64; +typedef int64_t sbits64; + +#define LIT64( a ) a##LL +#define INLINE static inline + +/*---------------------------------------------------------------------------- +| The macro `FLOATX80' must be defined to enable the extended double-precision +| floating-point format `floatx80'. If this macro is not defined, the +| `floatx80' type will not be defined, and none of the functions that either +| input or output the `floatx80' type will be defined. The same applies to +| the `FLOAT128' macro and the quadruple-precision format `float128'. 
+*----------------------------------------------------------------------------*/ +#ifdef CONFIG_SOFTFLOAT +/* bit exact soft float support */ +#define FLOATX80 +#define FLOAT128 +#else +/* native float support */ +#if (defined(__i386__) || defined(__x86_64__)) && (!defined(CONFIG_BSD) || defined(VBOX)) /** @todo VBOX: not correct on windows */ +#define FLOATX80 +#endif +#endif /* !CONFIG_SOFTFLOAT */ + +#if defined(VBOX) && (!defined(FLOATX80) || defined(CONFIG_SOFTFLOAT)) +# error misconfigured +#endif + +#define STATUS_PARAM , float_status *status +#define STATUS(field) status->field +#define STATUS_VAR , status + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point ordering relations +*----------------------------------------------------------------------------*/ +enum { + float_relation_less = -1, + float_relation_equal = 0, + float_relation_greater = 1, + float_relation_unordered = 2 +}; + +#ifdef CONFIG_SOFTFLOAT +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ +/* Use structures for soft-float types. This prevents accidentally mixing + them with native int/float types. A sufficiently clever compiler and + sane ABI should be able to see though these structs. However + x86/gcc 3.x seems to struggle a bit, so leave them disabled by default. */ +//#define USE_SOFTFLOAT_STRUCT_TYPES +#ifdef USE_SOFTFLOAT_STRUCT_TYPES +typedef struct { + uint32_t v; +} float32; +/* The cast ensures an error if the wrong type is passed. */ +#define float32_val(x) (((float32)(x)).v) +#define make_float32(x) __extension__ ({ float32 f32_val = {x}; f32_val; }) +typedef struct { + uint64_t v; +} float64; +#define float64_val(x) (((float64)(x)).v) +#define make_float64(x) __extension__ ({ float64 f64_val = {x}; f64_val; }) +#else +typedef uint32_t float32; +typedef uint64_t float64; +#define float32_val(x) (x) +#define float64_val(x) (x) +#define make_float32(x) (x) +#define make_float64(x) (x) +#endif +#ifdef FLOATX80 +typedef struct { + uint64_t low; + uint16_t high; +} floatx80; +#endif +#ifdef FLOAT128 +typedef struct { +#ifdef HOST_WORDS_BIGENDIAN + uint64_t high, low; +#else + uint64_t low, high; +#endif +} float128; +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point underflow tininess-detection mode. +*----------------------------------------------------------------------------*/ +enum { + float_tininess_after_rounding = 0, + float_tininess_before_rounding = 1 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +enum { + float_round_nearest_even = 0, + float_round_down = 1, + float_round_up = 2, + float_round_to_zero = 3 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point exception flags. 
+*----------------------------------------------------------------------------*/ +enum { + float_flag_invalid = 1, + float_flag_divbyzero = 4, + float_flag_overflow = 8, + float_flag_underflow = 16, + float_flag_inexact = 32 +}; + +typedef struct float_status { + signed char float_detect_tininess; + signed char float_rounding_mode; + signed char float_exception_flags; +#ifdef FLOATX80 + signed char floatx80_rounding_precision; +#endif + flag flush_to_zero; + flag default_nan_mode; +} float_status; + +void set_float_rounding_mode(int val STATUS_PARAM); +void set_float_exception_flags(int val STATUS_PARAM); +INLINE void set_flush_to_zero(flag val STATUS_PARAM) +{ + STATUS(flush_to_zero) = val; +} +INLINE void set_default_nan_mode(flag val STATUS_PARAM) +{ + STATUS(default_nan_mode) = val; +} +INLINE int get_float_exception_flags(float_status *status) +{ + return STATUS(float_exception_flags); +} +#ifdef FLOATX80 +void set_floatx80_rounding_precision(int val STATUS_PARAM); +#endif + +/*---------------------------------------------------------------------------- +| Routine to raise any or all of the software IEC/IEEE floating-point +| exception flags. +*----------------------------------------------------------------------------*/ +void float_raise( int8 flags STATUS_PARAM); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ +float32 int32_to_float32( int STATUS_PARAM ); +float64 int32_to_float64( int STATUS_PARAM ); +float32 uint32_to_float32( unsigned int STATUS_PARAM ); +float64 uint32_to_float64( unsigned int STATUS_PARAM ); +#ifdef FLOATX80 +floatx80 int32_to_floatx80( int STATUS_PARAM ); +#endif +#ifdef FLOAT128 +float128 int32_to_float128( int STATUS_PARAM ); +#endif +float32 int64_to_float32( int64_t STATUS_PARAM ); +float32 uint64_to_float32( uint64_t STATUS_PARAM ); +float64 int64_to_float64( int64_t STATUS_PARAM ); +float64 uint64_to_float64( uint64_t STATUS_PARAM ); +#ifdef FLOATX80 +floatx80 int64_to_floatx80( int64_t STATUS_PARAM ); +#endif +#ifdef FLOAT128 +float128 int64_to_float128( int64_t STATUS_PARAM ); +#endif + +/*---------------------------------------------------------------------------- +| Software half-precision conversion routines. +*----------------------------------------------------------------------------*/ +bits16 float32_to_float16( float32, flag STATUS_PARAM ); +float32 float16_to_float32( bits16, flag STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int float32_to_int32( float32 STATUS_PARAM ); +int float32_to_int32_round_to_zero( float32 STATUS_PARAM ); +unsigned int float32_to_uint32( float32 STATUS_PARAM ); +unsigned int float32_to_uint32_round_to_zero( float32 STATUS_PARAM ); +int64_t float32_to_int64( float32 STATUS_PARAM ); +int64_t float32_to_int64_round_to_zero( float32 STATUS_PARAM ); +float64 float32_to_float64( float32 STATUS_PARAM ); +#ifdef FLOATX80 +floatx80 float32_to_floatx80( float32 STATUS_PARAM ); +#endif +#ifdef FLOAT128 +float128 float32_to_float128( float32 STATUS_PARAM ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. 
+*----------------------------------------------------------------------------*/ +float32 float32_round_to_int( float32 STATUS_PARAM ); +float32 float32_add( float32, float32 STATUS_PARAM ); +float32 float32_sub( float32, float32 STATUS_PARAM ); +float32 float32_mul( float32, float32 STATUS_PARAM ); +float32 float32_div( float32, float32 STATUS_PARAM ); +float32 float32_rem( float32, float32 STATUS_PARAM ); +float32 float32_sqrt( float32 STATUS_PARAM ); +float32 float32_exp2( float32 STATUS_PARAM ); +float32 float32_log2( float32 STATUS_PARAM ); +int float32_eq( float32, float32 STATUS_PARAM ); +int float32_le( float32, float32 STATUS_PARAM ); +int float32_lt( float32, float32 STATUS_PARAM ); +int float32_eq_signaling( float32, float32 STATUS_PARAM ); +int float32_le_quiet( float32, float32 STATUS_PARAM ); +int float32_lt_quiet( float32, float32 STATUS_PARAM ); +int float32_compare( float32, float32 STATUS_PARAM ); +int float32_compare_quiet( float32, float32 STATUS_PARAM ); +int float32_is_nan( float32 ); +int float32_is_signaling_nan( float32 ); +float32 float32_scalbn( float32, int STATUS_PARAM ); + +INLINE float32 float32_abs(float32 a) +{ + return make_float32(float32_val(a) & 0x7fffffff); +} + +INLINE float32 float32_chs(float32 a) +{ + return make_float32(float32_val(a) ^ 0x80000000); +} + +INLINE int float32_is_infinity(float32 a) +{ + return (float32_val(a) & 0x7fffffff) == 0x7f800000; +} + +INLINE int float32_is_neg(float32 a) +{ + return float32_val(a) >> 31; +} + +INLINE int float32_is_zero(float32 a) +{ + return (float32_val(a) & 0x7fffffff) == 0; +} + +#define float32_zero make_float32(0) +#define float32_one make_float32(0x3f800000) +#define float32_ln2 make_float32(0x3f317218) + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int float64_to_int32( float64 STATUS_PARAM ); +int float64_to_int32_round_to_zero( float64 STATUS_PARAM ); +unsigned int float64_to_uint32( float64 STATUS_PARAM ); +unsigned int float64_to_uint32_round_to_zero( float64 STATUS_PARAM ); +int64_t float64_to_int64( float64 STATUS_PARAM ); +int64_t float64_to_int64_round_to_zero( float64 STATUS_PARAM ); +uint64_t float64_to_uint64 (float64 a STATUS_PARAM); +uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM); +float32 float64_to_float32( float64 STATUS_PARAM ); +#ifdef FLOATX80 +floatx80 float64_to_floatx80( float64 STATUS_PARAM ); +#endif +#ifdef FLOAT128 +float128 float64_to_float128( float64 STATUS_PARAM ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. 
+*----------------------------------------------------------------------------*/ +float64 float64_round_to_int( float64 STATUS_PARAM ); +float64 float64_trunc_to_int( float64 STATUS_PARAM ); +float64 float64_add( float64, float64 STATUS_PARAM ); +float64 float64_sub( float64, float64 STATUS_PARAM ); +float64 float64_mul( float64, float64 STATUS_PARAM ); +float64 float64_div( float64, float64 STATUS_PARAM ); +float64 float64_rem( float64, float64 STATUS_PARAM ); +float64 float64_sqrt( float64 STATUS_PARAM ); +float64 float64_log2( float64 STATUS_PARAM ); +int float64_eq( float64, float64 STATUS_PARAM ); +int float64_le( float64, float64 STATUS_PARAM ); +int float64_lt( float64, float64 STATUS_PARAM ); +int float64_eq_signaling( float64, float64 STATUS_PARAM ); +int float64_le_quiet( float64, float64 STATUS_PARAM ); +int float64_lt_quiet( float64, float64 STATUS_PARAM ); +int float64_compare( float64, float64 STATUS_PARAM ); +int float64_compare_quiet( float64, float64 STATUS_PARAM ); +int float64_is_nan( float64 a ); +int float64_is_signaling_nan( float64 ); +float64 float64_scalbn( float64, int STATUS_PARAM ); + +INLINE float64 float64_abs(float64 a) +{ + return make_float64(float64_val(a) & 0x7fffffffffffffffLL); +} + +INLINE float64 float64_chs(float64 a) +{ + return make_float64(float64_val(a) ^ 0x8000000000000000LL); +} + +INLINE int float64_is_infinity(float64 a) +{ + return (float64_val(a) & 0x7fffffffffffffffLL ) == 0x7ff0000000000000LL; +} + +INLINE int float64_is_neg(float64 a) +{ + return float64_val(a) >> 63; +} + +INLINE int float64_is_zero(float64 a) +{ + return (float64_val(a) & 0x7fffffffffffffffLL) == 0; +} + +#define float64_zero make_float64(0) +#define float64_one make_float64(0x3ff0000000000000LL) +#define float64_ln2 make_float64(0x3fe62e42fefa39efLL) + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int floatx80_to_int32( floatx80 STATUS_PARAM ); +int floatx80_to_int32_round_to_zero( floatx80 STATUS_PARAM ); +int64_t floatx80_to_int64( floatx80 STATUS_PARAM ); +int64_t floatx80_to_int64_round_to_zero( floatx80 STATUS_PARAM ); +float32 floatx80_to_float32( floatx80 STATUS_PARAM ); +float64 floatx80_to_float64( floatx80 STATUS_PARAM ); +#ifdef FLOAT128 +float128 floatx80_to_float128( floatx80 STATUS_PARAM ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. 
+*----------------------------------------------------------------------------*/ +floatx80 floatx80_round_to_int( floatx80 STATUS_PARAM ); +floatx80 floatx80_add( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_sub( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_mul( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_div( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_rem( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_sqrt( floatx80 STATUS_PARAM ); +int floatx80_eq( floatx80, floatx80 STATUS_PARAM ); +int floatx80_le( floatx80, floatx80 STATUS_PARAM ); +int floatx80_lt( floatx80, floatx80 STATUS_PARAM ); +int floatx80_eq_signaling( floatx80, floatx80 STATUS_PARAM ); +int floatx80_le_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_is_nan( floatx80 ); +int floatx80_is_signaling_nan( floatx80 ); +floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM ); + +INLINE floatx80 floatx80_abs(floatx80 a) +{ + a.high &= 0x7fff; + return a; +} + +INLINE floatx80 floatx80_chs(floatx80 a) +{ + a.high ^= 0x8000; + return a; +} + +INLINE int floatx80_is_infinity(floatx80 a) +{ + return (a.high & 0x7fff) == 0x7fff && a.low == 0; +} + +INLINE int floatx80_is_neg(floatx80 a) +{ + return a.high >> 15; +} + +INLINE int floatx80_is_zero(floatx80 a) +{ + return (a.high & 0x7fff) == 0 && a.low == 0; +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision conversion routines. +*----------------------------------------------------------------------------*/ +int float128_to_int32( float128 STATUS_PARAM ); +int float128_to_int32_round_to_zero( float128 STATUS_PARAM ); +int64_t float128_to_int64( float128 STATUS_PARAM ); +int64_t float128_to_int64_round_to_zero( float128 STATUS_PARAM ); +float32 float128_to_float32( float128 STATUS_PARAM ); +float64 float128_to_float64( float128 STATUS_PARAM ); +#ifdef FLOATX80 +floatx80 float128_to_floatx80( float128 STATUS_PARAM ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision operations. 
+*----------------------------------------------------------------------------*/ +float128 float128_round_to_int( float128 STATUS_PARAM ); +float128 float128_add( float128, float128 STATUS_PARAM ); +float128 float128_sub( float128, float128 STATUS_PARAM ); +float128 float128_mul( float128, float128 STATUS_PARAM ); +float128 float128_div( float128, float128 STATUS_PARAM ); +float128 float128_rem( float128, float128 STATUS_PARAM ); +float128 float128_sqrt( float128 STATUS_PARAM ); +int float128_eq( float128, float128 STATUS_PARAM ); +int float128_le( float128, float128 STATUS_PARAM ); +int float128_lt( float128, float128 STATUS_PARAM ); +int float128_eq_signaling( float128, float128 STATUS_PARAM ); +int float128_le_quiet( float128, float128 STATUS_PARAM ); +int float128_lt_quiet( float128, float128 STATUS_PARAM ); +int float128_compare( float128, float128 STATUS_PARAM ); +int float128_compare_quiet( float128, float128 STATUS_PARAM ); +int float128_is_nan( float128 ); +int float128_is_signaling_nan( float128 ); +float128 float128_scalbn( float128, int STATUS_PARAM ); + +INLINE float128 float128_abs(float128 a) +{ + a.high &= 0x7fffffffffffffffLL; + return a; +} + +INLINE float128 float128_chs(float128 a) +{ + a.high ^= 0x8000000000000000LL; + return a; +} + +INLINE int float128_is_infinity(float128 a) +{ + return (a.high & 0x7fffffffffffffffLL) == 0x7fff000000000000LL && a.low == 0; +} + +INLINE int float128_is_neg(float128 a) +{ + return a.high >> 63; +} + +INLINE int float128_is_zero(float128 a) +{ + return (a.high & 0x7fffffffffffffffLL) == 0 && a.low == 0; +} + +#endif + +#else /* CONFIG_SOFTFLOAT */ + +#include "softfloat-native.h" + +#endif /* !CONFIG_SOFTFLOAT */ + +#endif /* !SOFTFLOAT_H */ diff --git a/src/recompiler/gen-icount.h b/src/recompiler/gen-icount.h new file mode 100644 index 00000000..13512960 --- /dev/null +++ b/src/recompiler/gen-icount.h @@ -0,0 +1,48 @@ +#include "qemu-timer.h" + +/* Helpers for instruction counting code generation. */ + +static TCGArg *icount_arg; +static int icount_label; + +static inline void gen_icount_start(void) +{ + TCGv_i32 count; + + if (!use_icount) + return; + + icount_label = gen_new_label(); + count = tcg_temp_local_new_i32(); + tcg_gen_ld_i32(count, cpu_env, offsetof(CPUState, icount_decr.u32)); + /* This is a horrid hack to allow fixing up the value later. */ + icount_arg = gen_opparam_ptr + 1; + tcg_gen_subi_i32(count, count, 0xdeadbeef); + + tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, icount_label); + tcg_gen_st16_i32(count, cpu_env, offsetof(CPUState, icount_decr.u16.low)); + tcg_temp_free_i32(count); +} + +static void gen_icount_end(TranslationBlock *tb, int num_insns) +{ + if (use_icount) { + *icount_arg = num_insns; + gen_set_label(icount_label); + tcg_gen_exit_tb((uintptr_t)(tb + 2)); + } +} + +static inline void gen_io_start(void) +{ + TCGv_i32 tmp = tcg_const_i32(1); + tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUState, can_do_io)); + tcg_temp_free_i32(tmp); +} + +static inline void gen_io_end(void) +{ + TCGv_i32 tmp = tcg_const_i32(0); + tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUState, can_do_io)); + tcg_temp_free_i32(tmp); +} diff --git a/src/recompiler/host-utils.c b/src/recompiler/host-utils.c new file mode 100644 index 00000000..4afee79e --- /dev/null +++ b/src/recompiler/host-utils.c @@ -0,0 +1,109 @@ +/* + * Utility compute operations used by translated code. 
+ * + * Copyright (c) 2003 Fabrice Bellard + * Copyright (c) 2007 Aurelien Jarno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdlib.h> +#ifndef VBOX +#include <stdint.h> +#else +# include <iprt/types.h> +#endif +#include "host-utils.h" + +//#define DEBUG_MULDIV + +/* Long integer helpers */ +#if !defined(__x86_64__) +static void add128 (uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b) +{ + *plow += a; + /* carry test */ + if (*plow < a) + (*phigh)++; + *phigh += b; +} + +static void neg128 (uint64_t *plow, uint64_t *phigh) +{ + *plow = ~*plow; + *phigh = ~*phigh; + add128(plow, phigh, 1, 0); +} + +static void mul64 (uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b) +{ + uint32_t a0, a1, b0, b1; + uint64_t v; + + a0 = a; + a1 = a >> 32; + + b0 = b; + b1 = b >> 32; + + v = (uint64_t)a0 * (uint64_t)b0; + *plow = v; + *phigh = 0; + + v = (uint64_t)a0 * (uint64_t)b1; + add128(plow, phigh, v << 32, v >> 32); + + v = (uint64_t)a1 * (uint64_t)b0; + add128(plow, phigh, v << 32, v >> 32); + + v = (uint64_t)a1 * (uint64_t)b1; + *phigh += v; +} + +/* Unsigned 64x64 -> 128 multiplication */ +void mulu64 (uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b) +{ + mul64(plow, phigh, a, b); +#if defined(DEBUG_MULDIV) + printf("mulu64: 0x%016llx * 0x%016llx = 0x%016llx%016llx\n", + a, b, *phigh, *plow); +#endif +} + +/* Signed 64x64 -> 128 multiplication */ +void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b) +{ + int sa, sb; + + sa = (a < 0); + if (sa) + a = -a; + sb = (b < 0); + if (sb) + b = -b; + mul64(plow, phigh, a, b); + if (sa ^ sb) { + neg128(plow, phigh); + } +#if defined(DEBUG_MULDIV) + printf("muls64: 0x%016llx * 0x%016llx = 0x%016llx%016llx\n", + a, b, *phigh, *plow); +#endif +} +#endif /* !defined(__x86_64__) */ diff --git a/src/recompiler/host-utils.h b/src/recompiler/host-utils.h new file mode 100644 index 00000000..0ddc1765 --- /dev/null +++ b/src/recompiler/host-utils.h @@ -0,0 +1,236 @@ +/* + * Utility compute operations used by translated code. 
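A small stand-alone check of the 128-bit multiply helpers defined above
(hypothetical test code, not part of the imported sources; note that the
prototypes in host-utils.h name the two pointer arguments in the opposite
order, but the definitions take the low word first):

    #include <stdio.h>
    #include <stdint.h>
    #include "host-utils.h"

    int main(void)
    {
        uint64_t lo, hi;

        /* 0xffffffffffffffff * 2 = 2^65 - 2 -> high 0x1, low 0xff..fe */
        mulu64(&lo, &hi, 0xffffffffffffffffULL, 2);
        printf("unsigned: 0x%016llx%016llx\n",
               (unsigned long long)hi, (unsigned long long)lo);

        /* -3 * 4 = -12 -> 128-bit two's complement 0xff..ff ff..f4 */
        muls64(&lo, &hi, -3, 4);
        printf("signed:   0x%016llx%016llx\n",
               (unsigned long long)hi, (unsigned long long)lo);
        return 0;
    }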
+ * + * Copyright (c) 2007 Thiemo Seufer + * Copyright (c) 2007 Jocelyn Mayer + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "osdep.h" + +#if defined(__x86_64__) +#define __HAVE_FAST_MULU64__ +static inline void mulu64(uint64_t *plow, uint64_t *phigh, + uint64_t a, uint64_t b) +{ + __asm__ ("mul %0\n\t" + : "=d" (*phigh), "=a" (*plow) + : "a" (a), "0" (b)); +} +#define __HAVE_FAST_MULS64__ +static inline void muls64(uint64_t *plow, uint64_t *phigh, + int64_t a, int64_t b) +{ + __asm__ ("imul %0\n\t" + : "=d" (*phigh), "=a" (*plow) + : "a" (a), "0" (b)); +} +#else +void muls64(uint64_t *phigh, uint64_t *plow, int64_t a, int64_t b); +void mulu64(uint64_t *phigh, uint64_t *plow, uint64_t a, uint64_t b); +#endif + +/* Binary search for leading zeros. 
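   With GCC >= 3.4 the __builtin_clz / __builtin_ctz / __builtin_popcount
   family is used, with the zero-input case handled explicitly since the
   builtins leave it undefined; the portable fall-backs below narrow the
   search 16, 8, 4, 2 and 1 bits at a time.  For example clz32(1) == 31,
   clz32(0) == 32 and ctz32(0x8000) == 15.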
*/ + +static inline int clz32(uint32_t val) +{ +#if QEMU_GNUC_PREREQ(3, 4) + if (val) + return __builtin_clz(val); + else + return 32; +#else + int cnt = 0; + + if (!(val & 0xFFFF0000U)) { + cnt += 16; + val <<= 16; + } + if (!(val & 0xFF000000U)) { + cnt += 8; + val <<= 8; + } + if (!(val & 0xF0000000U)) { + cnt += 4; + val <<= 4; + } + if (!(val & 0xC0000000U)) { + cnt += 2; + val <<= 2; + } + if (!(val & 0x80000000U)) { + cnt++; + val <<= 1; + } + if (!(val & 0x80000000U)) { + cnt++; + } + return cnt; +#endif +} + +static inline int clo32(uint32_t val) +{ + return clz32(~val); +} + +static inline int clz64(uint64_t val) +{ +#if QEMU_GNUC_PREREQ(3, 4) + if (val) + return __builtin_clzll(val); + else + return 64; +#else + int cnt = 0; + + if (!(val >> 32)) { + cnt += 32; + } else { + val >>= 32; + } + + return cnt + clz32(val); +#endif +} + +static inline int clo64(uint64_t val) +{ + return clz64(~val); +} + +static inline int ctz32(uint32_t val) +{ +#if QEMU_GNUC_PREREQ(3, 4) + if (val) + return __builtin_ctz(val); + else + return 32; +#else + int cnt; + + cnt = 0; + if (!(val & 0x0000FFFFUL)) { + cnt += 16; + val >>= 16; + } + if (!(val & 0x000000FFUL)) { + cnt += 8; + val >>= 8; + } + if (!(val & 0x0000000FUL)) { + cnt += 4; + val >>= 4; + } + if (!(val & 0x00000003UL)) { + cnt += 2; + val >>= 2; + } + if (!(val & 0x00000001UL)) { + cnt++; + val >>= 1; + } + if (!(val & 0x00000001UL)) { + cnt++; + } + + return cnt; +#endif +} + +static inline int cto32(uint32_t val) +{ + return ctz32(~val); +} + +static inline int ctz64(uint64_t val) +{ +#if QEMU_GNUC_PREREQ(3, 4) + if (val) + return __builtin_ctzll(val); + else + return 64; +#else + int cnt; + + cnt = 0; + if (!((uint32_t)val)) { + cnt += 32; + val >>= 32; + } + + return cnt + ctz32(val); +#endif +} + +static inline int cto64(uint64_t val) +{ + return ctz64(~val); +} + +static inline int ctpop8(uint8_t val) +{ + val = (val & 0x55) + ((val >> 1) & 0x55); + val = (val & 0x33) + ((val >> 2) & 0x33); + val = (val & 0x0f) + ((val >> 4) & 0x0f); + + return val; +} + +static inline int ctpop16(uint16_t val) +{ + val = (val & 0x5555) + ((val >> 1) & 0x5555); + val = (val & 0x3333) + ((val >> 2) & 0x3333); + val = (val & 0x0f0f) + ((val >> 4) & 0x0f0f); + val = (val & 0x00ff) + ((val >> 8) & 0x00ff); + + return val; +} + +static inline int ctpop32(uint32_t val) +{ +#if QEMU_GNUC_PREREQ(3, 4) + return __builtin_popcount(val); +#else + val = (val & 0x55555555) + ((val >> 1) & 0x55555555); + val = (val & 0x33333333) + ((val >> 2) & 0x33333333); + val = (val & 0x0f0f0f0f) + ((val >> 4) & 0x0f0f0f0f); + val = (val & 0x00ff00ff) + ((val >> 8) & 0x00ff00ff); + val = (val & 0x0000ffff) + ((val >> 16) & 0x0000ffff); + + return val; +#endif +} + +static inline int ctpop64(uint64_t val) +{ +#if QEMU_GNUC_PREREQ(3, 4) + return __builtin_popcountll(val); +#else + val = (val & 0x5555555555555555ULL) + ((val >> 1) & 0x5555555555555555ULL); + val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); + val = (val & 0x0f0f0f0f0f0f0f0fULL) + ((val >> 4) & 0x0f0f0f0f0f0f0f0fULL); + val = (val & 0x00ff00ff00ff00ffULL) + ((val >> 8) & 0x00ff00ff00ff00ffULL); + val = (val & 0x0000ffff0000ffffULL) + ((val >> 16) & 0x0000ffff0000ffffULL); + val = (val & 0x00000000ffffffffULL) + ((val >> 32) & 0x00000000ffffffffULL); + + return val; +#endif +} diff --git a/src/recompiler/hostregs_helper.h b/src/recompiler/hostregs_helper.h new file mode 100644 index 00000000..c6c1ddb9 --- /dev/null +++ b/src/recompiler/hostregs_helper.h @@ -0,0 +1,70 @@ +/* + * 
Save/restore host registers. + * + * Copyright (c) 2007 CodeSourcery + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +/* The GCC global register variable extension is used to reserve some + host registers for use by generated code. However only the core parts of + the translation engine are compiled with these settings. We must manually + save/restore these registers when called from regular code. + It is not sufficient to save/restore T0 et. al. as these may be declared + with a datatype smaller than the actual register. */ + +#if defined(DECLARE_HOST_REGS) + +#define DO_REG(REG) \ + register host_reg_t reg_AREG##REG asm(AREG##REG); \ + volatile host_reg_t saved_AREG##REG; + +#elif defined(SAVE_HOST_REGS) + +#define DO_REG(REG) \ + __asm__ __volatile__ ("" : "=r" (reg_AREG##REG)); \ + saved_AREG##REG = reg_AREG##REG; + +#else + +#define DO_REG(REG) \ + reg_AREG##REG = saved_AREG##REG; \ + __asm__ __volatile__ ("" : : "r" (reg_AREG##REG)); + +#endif + +#ifdef AREG0 +DO_REG(0) +#endif + +#ifdef AREG1 +DO_REG(1) +#endif + +#ifdef AREG2 +DO_REG(2) +#endif + +#undef SAVE_HOST_REGS +#undef DECLARE_HOST_REGS +#undef DO_REG diff --git a/src/recompiler/ioport.h b/src/recompiler/ioport.h new file mode 100644 index 00000000..8df11ce0 --- /dev/null +++ b/src/recompiler/ioport.h @@ -0,0 +1,71 @@ +/* + * defines ioport related functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
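The hostregs_helper.h fragment above is meant to be included several times
with different control macros defined; a sketch of the usual shape (the
surrounding function is hypothetical -- upstream QEMU uses essentially this
pattern inside cpu_exec()):

    static void run_with_saved_host_regs(void)
    {
        /* First include: declare the register variables and save slots. */
    #define DECLARE_HOST_REGS 1
    #include "hostregs_helper.h"

        /* Second include: copy the reserved host registers into the slots. */
    #define SAVE_HOST_REGS 1
    #include "hostregs_helper.h"

        /* ... run generated code that may clobber AREG0..AREG2 ... */

        /* Third include, with neither macro defined: restore the registers. */
    #include "hostregs_helper.h"
    }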
+ */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +/************************************************************************** + * IO ports API + */ + +#ifndef IOPORT_H +#define IOPORT_H + +#include "qemu-common.h" + +typedef uint32_t pio_addr_t; +#define FMT_pioaddr PRIx32 + +#define MAX_IOPORTS (64 * 1024) +#define IOPORTS_MASK (MAX_IOPORTS - 1) + +/* These should really be in isa.h, but are here to make pc.h happy. */ +typedef void (IOPortWriteFunc)(void *opaque, uint32_t address, uint32_t data); +typedef uint32_t (IOPortReadFunc)(void *opaque, uint32_t address); + +int register_ioport_read(pio_addr_t start, int length, int size, + IOPortReadFunc *func, void *opaque); +int register_ioport_write(pio_addr_t start, int length, int size, + IOPortWriteFunc *func, void *opaque); +void isa_unassign_ioport(pio_addr_t start, int length); + + +#ifndef VBOX +void cpu_outb(pio_addr_t addr, uint8_t val); +void cpu_outw(pio_addr_t addr, uint16_t val); +void cpu_outl(pio_addr_t addr, uint32_t val); +uint8_t cpu_inb(pio_addr_t addr); +uint16_t cpu_inw(pio_addr_t addr); +uint32_t cpu_inl(pio_addr_t addr); +#else +void cpu_outb(CPUX86State *env, pio_addr_t addr, uint8_t val); +void cpu_outw(CPUX86State *env, pio_addr_t addr, uint16_t val); +void cpu_outl(CPUX86State *env, pio_addr_t addr, uint32_t val); +uint8_t cpu_inb(CPUX86State *env, pio_addr_t addr); +uint16_t cpu_inw(CPUX86State *env, pio_addr_t addr); +uint32_t cpu_inl(CPUX86State *env, pio_addr_t addr); +#endif + +#endif /* IOPORT_H */ diff --git a/src/recompiler/osdep.h b/src/recompiler/osdep.h new file mode 100644 index 00000000..b69d157f --- /dev/null +++ b/src/recompiler/osdep.h @@ -0,0 +1,166 @@ +#ifndef QEMU_OSDEP_H +#define QEMU_OSDEP_H + +#ifdef VBOX /** @todo clean up this, it's not fully synched. */ + +#include <iprt/alloc.h> +#ifndef RT_OS_WINDOWS +# include <iprt/alloca.h> +#endif +#include <iprt/stdarg.h> +#include <iprt/string.h> + +#include "config.h" + +#define VBOX_ONLY(x) x + +#define qemu_snprintf(pszBuf, cbBuf, ...) RTStrPrintf((pszBuf), (cbBuf), __VA_ARGS__) +#define qemu_vsnprintf(pszBuf, cbBuf, pszFormat, args) \ + RTStrPrintfV((pszBuf), (cbBuf), (pszFormat), (args)) +#define qemu_vprintf(pszFormat, args) \ + RTLogPrintfV((pszFormat), (args)) + +/**@todo the following macros belongs elsewhere */ +#define qemu_malloc(cb) RTMemAlloc(cb) +#define qemu_mallocz(cb) RTMemAllocZ(cb) +#define qemu_realloc(ptr, cb) RTMemRealloc(ptr, cb) +#define qemu_free(pv) RTMemFree(pv) +#define qemu_strdup(psz) RTStrDup(psz) + +/* Misc wrappers */ +#define fflush(file) RTLogFlush(NULL) +#define printf(...) LogIt(0, LOG_GROUP_REM_PRINTF, (__VA_ARGS__)) +/* If DEBUG_TMP_LOGGING - goes to QEMU log file */ +#ifndef DEBUG_TMP_LOGGING +# define fprintf(logfile, ...) 
LogIt(0, LOG_GROUP_REM_PRINTF, (__VA_ARGS__)) +#endif + +#define assert(cond) Assert(cond) + +#else /* !VBOX */ + +#include <stdarg.h> +#include <stddef.h> + +#define VBOX_ONLY(x) /* nike */ +#define qemu_snprintf snprintf /* bird */ +#define qemu_vsnprintf vsnprintf /* bird */ +#define qemu_vprintf vprintf /* bird */ + +#endif /* !VBOX */ + +#ifdef __OpenBSD__ +#include <sys/types.h> +#include <sys/signal.h> +#endif + +#ifndef VBOX +#ifndef _WIN32 +#include <sys/time.h> +#endif +#endif /* !VBOX */ + +#ifndef glue +#define xglue(x, y) x ## y +#define glue(x, y) xglue(x, y) +#define stringify(s) tostring(s) +#define tostring(s) #s +#endif + +#ifndef likely +#if __GNUC__ < 3 +#define __builtin_expect(x, n) (x) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif + +#ifdef CONFIG_NEED_OFFSETOF +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *) 0)->MEMBER) +#endif +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *) 0)->member) *__mptr = (ptr); \ + (type *) ((char *) __mptr - offsetof(type, member));}) +#endif + +/* Convert from a base type to a parent type, with compile time checking. */ +#ifdef __GNUC__ +#define DO_UPCAST(type, field, dev) ( __extension__ ( { \ + char __attribute__((unused)) offset_must_be_zero[ \ + -offsetof(type, field)]; \ + container_of(dev, type, field);})) +#else +#define DO_UPCAST(type, field, dev) container_of(dev, type, field) +#endif + +#define typeof_field(type, field) typeof(((type *)0)->field) +#define type_check(t1,t2) ((t1*)0 - (t2*)0) + +#ifndef MIN +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +#ifndef always_inline +#if !((__GNUC__ < 3) || defined(__APPLE__)) +#ifdef __OPTIMIZE__ +#define inline __attribute__ (( always_inline )) __inline__ +#endif +#endif +#else +#define inline always_inline +#endif + +#ifdef __i386__ +#define REGPARM __attribute((regparm(3))) +#else +#define REGPARM +#endif + +#ifndef VBOX +#define qemu_printf printf +#else /*VBOX*/ +#define qemu_printf RTLogPrintf +#endif /*VBOX*/ + +#if defined (__GNUC__) && defined (__GNUC_MINOR__) +# define QEMU_GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +#else +# define QEMU_GNUC_PREREQ(maj, min) 0 +#endif + +#ifndef VBOX +void *qemu_memalign(size_t alignment, size_t size); +void *qemu_vmalloc(size_t size); +void qemu_vfree(void *ptr); + +int qemu_create_pidfile(const char *filename); + +#ifdef _WIN32 +int ffs(int i); + +typedef struct { + long tv_sec; + long tv_usec; +} qemu_timeval; +int qemu_gettimeofday(qemu_timeval *tp); +#else +typedef struct timeval qemu_timeval; +#define qemu_gettimeofday(tp) gettimeofday(tp, NULL); +#endif /* !_WIN32 */ +#else /* VBOX */ +# define qemu_memalign(alignment, size) ( (alignment) <= PAGE_SIZE ? 
RTMemPageAlloc((size)) : NULL ) +# define qemu_vfree(pv) RTMemPageFree(pv, missing_size_parameter) +# define qemu_vmalloc(cb) RTMemPageAlloc(cb) +#endif /* VBOX */ + +#endif diff --git a/src/recompiler/qemu-barrier.h b/src/recompiler/qemu-barrier.h new file mode 100644 index 00000000..b77fce23 --- /dev/null +++ b/src/recompiler/qemu-barrier.h @@ -0,0 +1,10 @@ +#ifndef __QEMU_BARRIER_H +#define __QEMU_BARRIER_H 1 + +/* FIXME: arch dependant, x86 version */ +#define smp_wmb() asm volatile("" ::: "memory") + +/* Compiler barrier */ +#define barrier() asm volatile("" ::: "memory") + +#endif diff --git a/src/recompiler/qemu-common.h b/src/recompiler/qemu-common.h new file mode 100644 index 00000000..ca6e3ce2 --- /dev/null +++ b/src/recompiler/qemu-common.h @@ -0,0 +1,348 @@ +/* Common header file that is included by all of qemu. */ +#ifndef QEMU_COMMON_H +#define QEMU_COMMON_H + +#include "config-host.h" + +#ifdef VBOX + +# include <iprt/string.h> +# include <iprt/types.h> +# include <iprt/ctype.h> + +void pstrcpy(char *buf, int buf_size, const char *str); +char *pstrcat(char *buf, int buf_size, const char *s); +# define snprintf RTStrPrintf + +# define qemu_isalnum(c) RT_C_IS_ALNUM((unsigned char)(c)) +# define qemu_isalpha(c) RT_C_IS_ALPHA((unsigned char)(c)) +# define qemu_iscntrl(c) RT_C_IS_CNTRL((unsigned char)(c)) +# define qemu_isdigit(c) RT_C_IS_DIGIT((unsigned char)(c)) +# define qemu_isgraph(c) RT_C_IS_GRAPH((unsigned char)(c)) +# define qemu_islower(c) RT_C_IS_LOWER((unsigned char)(c)) +# define qemu_isprint(c) RT_C_IS_PRINT((unsigned char)(c)) +# define qemu_ispunct(c) RT_C_IS_PUNCT((unsigned char)(c)) +# define qemu_isspace(c) RT_C_IS_SPACE((unsigned char)(c)) +# define qemu_isupper(c) RT_C_IS_UPPER((unsigned char)(c)) +# define qemu_isxdigit(c) RT_C_IS_XDIGIT((unsigned char)(c)) +# define qemu_tolower(c) RT_C_TO_LOWER((unsigned char)(c)) +# define qemu_toupper(c) RT_C_TO_UPPER((unsigned char)(c)) +# define qemu_isascii(c) RT_C_IS_ASCII((unsigned char)(c)) +# define qemu_toascii(c) RT_C_TO_ASCII((unsigned char)(c)) + +# define qemu_init_vcpu(env) do { } while (0) /* we don't need this :-) */ + +# define QEMU_NORETURN __attribute__((__noreturn__)) +# ifdef CONFIG_GCC_ATTRIBUTE_WARN_UNUSED_RESULT +# define QEMU_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +# else +# define QEMU_WARN_UNUSED_RESULT +# endif +#define QEMU_BUILD_BUG_ON(x) typedef char __build_bug_on__##__LINE__[(x)?-1:1]; + +#include <stdio.h> +#include "cpu.h" + + +#else /* !VBOX */ +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#define WINVER 0x0501 /* needed for ipv6 bits */ +#include <windows.h> +#endif + +#define QEMU_NORETURN __attribute__ ((__noreturn__)) +#ifdef CONFIG_GCC_ATTRIBUTE_WARN_UNUSED_RESULT +#define QEMU_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#else +#define QEMU_WARN_UNUSED_RESULT +#endif + +#define QEMU_BUILD_BUG_ON(x) typedef char __build_bug_on__##__LINE__[(x)?-1:1]; + +typedef struct QEMUTimer QEMUTimer; +typedef struct QEMUFile QEMUFile; +typedef struct QEMUBH QEMUBH; +typedef struct DeviceState DeviceState; + + +/* Hack around the mess dyngen-exec.h causes: We need QEMU_NORETURN in files that + cannot include the following headers without conflicts. This condition has + to be removed once dyngen is gone. 
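The container_of() / DO_UPCAST() macros in osdep.h above implement the usual
"recover the enclosing structure from a pointer to one of its members" idiom;
a short sketch with hypothetical types (the macro is repeated so the snippet
stands alone):

    #include <stddef.h>

    /* Same definition as in osdep.h above (GNU C statement expression). */
    #define container_of(ptr, type, member) ({                      \
            const typeof(((type *) 0)->member) *__mptr = (ptr);     \
            (type *) ((char *) __mptr - offsetof(type, member));})

    struct my_timer  { int pending; };                    /* hypothetical */
    struct my_device { int irq_level; struct my_timer timer; };

    /* Recover the containing device from a pointer to its embedded timer. */
    static struct my_device *device_from_timer(struct my_timer *t)
    {
        return container_of(t, struct my_device, timer);
    }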
*/ +#ifndef __DYNGEN_EXEC_H__ + +/* we put basic includes here to avoid repeating them in device drivers */ +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <stdbool.h> +#include <string.h> +#include <strings.h> +#include <inttypes.h> +#include <limits.h> +#include <time.h> +#include <ctype.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <assert.h> + +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif +#ifndef O_BINARY +#define O_BINARY 0 +#endif +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#ifndef ENOMEDIUM +#define ENOMEDIUM ENODEV +#endif +#if !defined(ENOTSUP) +#define ENOTSUP 4096 +#endif + +#ifndef CONFIG_IOVEC +#define CONFIG_IOVEC +struct iovec { + void *iov_base; + size_t iov_len; +}; +/* + * Use the same value as Linux for now. + */ +#define IOV_MAX 1024 +#else +#include <sys/uio.h> +#endif + +#ifdef _WIN32 +#define fsync _commit +#define lseek _lseeki64 +extern int qemu_ftruncate64(int, int64_t); +#define ftruncate qemu_ftruncate64 + +static inline char *realpath(const char *path, char *resolved_path) +{ + _fullpath(resolved_path, path, _MAX_PATH); + return resolved_path; +} + +#define PRId64 "I64d" +#define PRIx64 "I64x" +#define PRIu64 "I64u" +#define PRIo64 "I64o" +#endif + +/* FIXME: Remove NEED_CPU_H. */ +#ifndef NEED_CPU_H + +#include <setjmp.h> +#include "osdep.h" +#include "bswap.h" + +#else + +#include "cpu.h" + +#endif /* !defined(NEED_CPU_H) */ + +/* bottom halves */ +typedef void QEMUBHFunc(void *opaque); + +void async_context_push(void); +void async_context_pop(void); +int get_async_context_id(void); + +QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque); +void qemu_bh_schedule(QEMUBH *bh); +/* Bottom halfs that are scheduled from a bottom half handler are instantly + * invoked. This can create an infinite loop if a bottom half handler + * schedules itself. qemu_bh_schedule_idle() avoids this infinite loop by + * ensuring that the bottom half isn't executed until the next main loop + * iteration. 
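A minimal sketch of the bottom-half API declared here (hypothetical handler
and state, assuming the non-VBOX side of this header; a bottom half defers
the callback into the main loop instead of running it in the caller's
context):

    #include "qemu-common.h"

    static QEMUBH *flush_bh;
    static int     flush_pending;              /* hypothetical state */

    static void flush_done_bh(void *opaque)
    {
        int *pending = opaque;
        *pending = 0;                          /* runs from the main loop */
    }

    static void example_init(void)
    {
        flush_bh = qemu_bh_new(flush_done_bh, &flush_pending);
    }

    static void example_complete_io(void)
    {
        flush_pending = 1;
        qemu_bh_schedule(flush_bh);            /* callback runs later */
    }

    static void example_cleanup(void)
    {
        qemu_bh_delete(flush_bh);
    }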
+ */ +void qemu_bh_schedule_idle(QEMUBH *bh); +void qemu_bh_cancel(QEMUBH *bh); +void qemu_bh_delete(QEMUBH *bh); +int qemu_bh_poll(void); +void qemu_bh_update_timeout(int *timeout); + +uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c); + +void qemu_get_timedate(struct tm *tm, int offset); +int qemu_timedate_diff(struct tm *tm); + +/* cutils.c */ +void pstrcpy(char *buf, int buf_size, const char *str); +char *pstrcat(char *buf, int buf_size, const char *s); +int strstart(const char *str, const char *val, const char **ptr); +int stristart(const char *str, const char *val, const char **ptr); +int qemu_strnlen(const char *s, int max_len); +time_t mktimegm(struct tm *tm); +int qemu_fls(int i); +int qemu_fdatasync(int fd); +int fcntl_setfl(int fd, int flag); + +/* path.c */ +void init_paths(const char *prefix); +const char *path(const char *pathname); + +#define qemu_isalnum(c) isalnum((unsigned char)(c)) +#define qemu_isalpha(c) isalpha((unsigned char)(c)) +#define qemu_iscntrl(c) iscntrl((unsigned char)(c)) +#define qemu_isdigit(c) isdigit((unsigned char)(c)) +#define qemu_isgraph(c) isgraph((unsigned char)(c)) +#define qemu_islower(c) islower((unsigned char)(c)) +#define qemu_isprint(c) isprint((unsigned char)(c)) +#define qemu_ispunct(c) ispunct((unsigned char)(c)) +#define qemu_isspace(c) isspace((unsigned char)(c)) +#define qemu_isupper(c) isupper((unsigned char)(c)) +#define qemu_isxdigit(c) isxdigit((unsigned char)(c)) +#define qemu_tolower(c) tolower((unsigned char)(c)) +#define qemu_toupper(c) toupper((unsigned char)(c)) +#define qemu_isascii(c) isascii((unsigned char)(c)) +#define qemu_toascii(c) toascii((unsigned char)(c)) + +void *qemu_malloc(size_t size); +void *qemu_realloc(void *ptr, size_t size); +void *qemu_mallocz(size_t size); +void qemu_free(void *ptr); +char *qemu_strdup(const char *str); +char *qemu_strndup(const char *str, size_t size); + +void qemu_mutex_lock_iothread(void); +void qemu_mutex_unlock_iothread(void); + +int qemu_open(const char *name, int flags, ...); +ssize_t qemu_write_full(int fd, const void *buf, size_t count) + QEMU_WARN_UNUSED_RESULT; +void qemu_set_cloexec(int fd); + +#ifndef _WIN32 +int qemu_eventfd(int pipefd[2]); +int qemu_pipe(int pipefd[2]); +#endif + +/* Error handling. */ + +void QEMU_NORETURN hw_error(const char *fmt, ...) + __attribute__ ((__format__ (__printf__, 1, 2))); + +/* IO callbacks. */ +typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size); +typedef int IOCanReadHandler(void *opaque); +typedef void IOHandler(void *opaque); + +struct ParallelIOArg { + void *buffer; + int count; +}; + +typedef int (*DMA_transfer_handler) (void *opaque, int nchan, int pos, int size); + +/* A load of opaque types so that device init declarations don't have to + pull in all the real definitions. 
*/ +typedef struct NICInfo NICInfo; +typedef struct HCIInfo HCIInfo; +typedef struct AudioState AudioState; +typedef struct BlockDriverState BlockDriverState; +typedef struct DisplayState DisplayState; +typedef struct DisplayChangeListener DisplayChangeListener; +typedef struct DisplaySurface DisplaySurface; +typedef struct DisplayAllocator DisplayAllocator; +typedef struct PixelFormat PixelFormat; +typedef struct TextConsole TextConsole; +typedef TextConsole QEMUConsole; +typedef struct CharDriverState CharDriverState; +typedef struct MACAddr MACAddr; +typedef struct VLANState VLANState; +typedef struct VLANClientState VLANClientState; +typedef struct i2c_bus i2c_bus; +typedef struct i2c_slave i2c_slave; +typedef struct SMBusDevice SMBusDevice; +typedef struct PCIHostState PCIHostState; +typedef struct PCIExpressHost PCIExpressHost; +typedef struct PCIBus PCIBus; +typedef struct PCIDevice PCIDevice; +typedef struct SerialState SerialState; +typedef struct IRQState *qemu_irq; +typedef struct PCMCIACardState PCMCIACardState; +typedef struct MouseTransformInfo MouseTransformInfo; +typedef struct uWireSlave uWireSlave; +typedef struct I2SCodec I2SCodec; +typedef struct SSIBus SSIBus; +typedef struct EventNotifier EventNotifier; +typedef struct VirtIODevice VirtIODevice; + +typedef uint64_t pcibus_t; + +void cpu_exec_init_all(unsigned long tb_size); + +/* CPU save/load. */ +void cpu_save(QEMUFile *f, void *opaque); +int cpu_load(QEMUFile *f, void *opaque, int version_id); + +/* Force QEMU to stop what it's doing and service IO */ +void qemu_service_io(void); + +/* Force QEMU to process pending events */ +void qemu_notify_event(void); + +/* Unblock cpu */ +void qemu_cpu_kick(void *env); +int qemu_cpu_self(void *env); + +/* work queue */ +struct qemu_work_item { + struct qemu_work_item *next; + void (*func)(void *data); + void *data; + int done; +}; + +#ifdef CONFIG_USER_ONLY +#define qemu_init_vcpu(env) do { } while (0) +#else +void qemu_init_vcpu(void *env); +#endif + +typedef struct QEMUIOVector { + struct iovec *iov; + int niov; + int nalloc; + size_t size; +} QEMUIOVector; + +void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint); +void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov); +void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len); +void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size); +void qemu_iovec_destroy(QEMUIOVector *qiov); +void qemu_iovec_reset(QEMUIOVector *qiov); +void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf); +void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count); + +struct Monitor; +typedef struct Monitor Monitor; + +/* Convert a byte between binary and BCD. */ +static inline uint8_t to_bcd(uint8_t val) +{ + return ((val / 10) << 4) | (val % 10); +} + +static inline uint8_t from_bcd(uint8_t val) +{ + return ((val >> 4) * 10) + (val & 0x0f); +} + +#include "module.h" + +#endif /* dyngen-exec.h hack */ + +#endif /* !VBOX */ + +#endif diff --git a/src/recompiler/qemu-lock.h b/src/recompiler/qemu-lock.h new file mode 100644 index 00000000..1234a5b6 --- /dev/null +++ b/src/recompiler/qemu-lock.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
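The BCD helpers at the end of qemu-common.h above pack one decimal digit per
nibble; a quick stand-alone check (hypothetical test code, with the two
inlines repeated so it compiles on its own):

    #include <assert.h>
    #include <stdint.h>

    static inline uint8_t to_bcd(uint8_t val)
    {
        return ((val / 10) << 4) | (val % 10);
    }

    static inline uint8_t from_bcd(uint8_t val)
    {
        return ((val >> 4) * 10) + (val & 0x0f);
    }

    int main(void)
    {
        assert(to_bcd(42) == 0x42);        /* tens digit in the high nibble */
        assert(from_bcd(0x59) == 59);
        assert(from_bcd(to_bcd(7)) == 7);
        return 0;
    }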
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/> + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +/* Locking primitives. Most of this code should be redundant - + system emulation doesn't need/use locking, NPTL userspace uses + pthread mutexes, and non-NPTL userspace isn't threadsafe anyway. + In either case a spinlock is probably the wrong kind of lock. + Spinlocks are only good if you know annother CPU has the lock and is + likely to release it soon. In environments where you have more threads + than physical CPUs (the extreme case being a single CPU host) a spinlock + simply wastes CPU until the OS decides to preempt it. */ +#if defined(CONFIG_USE_NPTL) + +#include <pthread.h> +#define spin_lock pthread_mutex_lock +#define spin_unlock pthread_mutex_unlock +#define spinlock_t pthread_mutex_t +#define SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER + +#else + +#if defined(__hppa__) + +typedef int spinlock_t[4]; + +#define SPIN_LOCK_UNLOCKED { 1, 1, 1, 1 } + +static inline void resetlock (spinlock_t *p) +{ + (*p)[0] = (*p)[1] = (*p)[2] = (*p)[3] = 1; +} + +#else + +typedef int spinlock_t; + +#define SPIN_LOCK_UNLOCKED 0 + +static inline void resetlock (spinlock_t *p) +{ + *p = SPIN_LOCK_UNLOCKED; +} + +#endif + +#ifdef VBOX +DECLINLINE(int) testandset (int *p) +{ + return ASMAtomicCmpXchgU32((volatile uint32_t *)p, 1, 0) ? 0 : 1; +} +#elif defined(_ARCH_PPC) +static inline int testandset (int *p) +{ + int ret; + __asm__ __volatile__ ( + " lwarx %0,0,%1\n" + " xor. %0,%3,%0\n" + " bne $+12\n" + " stwcx. 
%2,0,%1\n" + " bne- $-16\n" + : "=&r" (ret) + : "r" (p), "r" (1), "r" (0) + : "cr0", "memory"); + return ret; +} +#elif defined(__i386__) +static inline int testandset (int *p) +{ + long int readval = 0; + + __asm__ __volatile__ ("lock; cmpxchgl %2, %0" + : "+m" (*p), "+a" (readval) + : "r" (1) + : "cc"); + return readval; +} +#elif defined(__x86_64__) +static inline int testandset (int *p) +{ + long int64_t readval = 0; + + __asm__ __volatile__ ("lock; cmpxchgl %2, %0" + : "+m" (*p), "+a" (readval) + : "r" (1) + : "cc"); + return readval; +} +#elif defined(__s390__) +static inline int testandset (int *p) +{ + int ret; + + __asm__ __volatile__ ("0: cs %0,%1,0(%2)\n" + " jl 0b" + : "=&d" (ret) + : "r" (1), "a" (p), "0" (*p) + : "cc", "memory" ); + return ret; +} +#elif defined(__alpha__) +static inline int testandset (int *p) +{ + int ret; + unsigned long one; + + __asm__ __volatile__ ("0: mov 1,%2\n" + " ldl_l %0,%1\n" + " stl_c %2,%1\n" + " beq %2,1f\n" + ".subsection 2\n" + "1: br 0b\n" + ".previous" + : "=r" (ret), "=m" (*p), "=r" (one) + : "m" (*p)); + return ret; +} +#elif defined(__sparc__) +static inline int testandset (int *p) +{ + int ret; + + __asm__ __volatile__("ldstub [%1], %0" + : "=r" (ret) + : "r" (p) + : "memory"); + + return (ret ? 1 : 0); +} +#elif defined(__arm__) +static inline int testandset (int *spinlock) +{ + register unsigned int ret; + __asm__ __volatile__("swp %0, %1, [%2]" + : "=r"(ret) + : "0"(1), "r"(spinlock)); + + return ret; +} +#elif defined(__mc68000) +static inline int testandset (int *p) +{ + char ret; + __asm__ __volatile__("tas %1; sne %0" + : "=r" (ret) + : "m" (p) + : "cc","memory"); + return ret; +} +#elif defined(__hppa__) + +/* Because malloc only guarantees 8-byte alignment for malloc'd data, + and GCC only guarantees 8-byte alignment for stack locals, we can't + be assured of 16-byte alignment for atomic lock data even if we + specify "__attribute ((aligned(16)))" in the type declaration. So, + we use a struct containing an array of four ints for the atomic lock + type and dynamically select the 16-byte aligned int from the array + for the semaphore. 
*/ +#define __PA_LDCW_ALIGNMENT 16 +static inline void *ldcw_align (void *p) { + unsigned long a = (unsigned long)p; + a = (a + __PA_LDCW_ALIGNMENT - 1) & ~(__PA_LDCW_ALIGNMENT - 1); + return (void *)a; +} + +static inline int testandset (spinlock_t *p) +{ + unsigned int ret; + p = ldcw_align(p); + __asm__ __volatile__("ldcw 0(%1),%0" + : "=r" (ret) + : "r" (p) + : "memory" ); + return !ret; +} + +#elif defined(__ia64) + +#include <ia64intrin.h> + +static inline int testandset (int *p) +{ + return __sync_lock_test_and_set (p, 1); +} +#elif defined(__mips__) +static inline int testandset (int *p) +{ + int ret; + + __asm__ __volatile__ ( + " .set push \n" + " .set noat \n" + " .set mips2 \n" + "1: li $1, 1 \n" + " ll %0, %1 \n" + " sc $1, %1 \n" + " beqz $1, 1b \n" + " .set pop " + : "=r" (ret), "+R" (*p) + : + : "memory"); + + return ret; +} +#else +#error unimplemented CPU support +#endif + +#if defined(CONFIG_USER_ONLY) +static inline void spin_lock(spinlock_t *lock) +{ + while (testandset(lock)); +} + +static inline void spin_unlock(spinlock_t *lock) +{ + resetlock(lock); +} + +static inline int spin_trylock(spinlock_t *lock) +{ + return !testandset(lock); +} +#else +static inline void spin_lock(spinlock_t *lock) +{ +} + +static inline void spin_unlock(spinlock_t *lock) +{ +} + +static inline int spin_trylock(spinlock_t *lock) +{ + return 1; +} +#endif + +#endif diff --git a/src/recompiler/qemu-log.h b/src/recompiler/qemu-log.h new file mode 100644 index 00000000..64d375b4 --- /dev/null +++ b/src/recompiler/qemu-log.h @@ -0,0 +1,135 @@ +#ifndef QEMU_LOG_H +#define QEMU_LOG_H + +/* The deprecated global variables: */ +extern FILE *logfile; +extern int loglevel; + + +/* + * The new API: + * + */ + +/* Log settings checking macros: */ + +/* Returns true if qemu_log() will really write somewhere + */ +#ifndef VBOX +#define qemu_log_enabled() (logfile != NULL) +#else +# define qemu_log_enabled() LogIsEnabled() +#endif + +/* Returns true if a bit is set in the current loglevel mask + */ +#define qemu_loglevel_mask(b) ((loglevel & (b)) != 0) + + +/* Logging functions: */ + +/* main logging function + */ +#ifndef VBOX +#define qemu_log(...) do { \ + if (logfile) \ + fprintf(logfile, ## __VA_ARGS__); \ + } while (0) +#else +# define qemu_log(...) Log((__VA_ARGS__)) +#endif + +/* vfprintf-like logging function + */ +#ifndef VBOX +#define qemu_log_vprintf(fmt, va) do { \ + if (logfile) \ + vfprintf(logfile, fmt, va); \ + } while (0) +#else +# define qemu_log_vprintf(fmt, va) do { \ + if (LogIsEnabled()) \ + RTLogLoggerExV(RTLOGGRPFLAGS_LEVEL_1, LOG_GROUP, fmt, va); \ + } while (0) +#endif + +/* log only if a bit is set on the current loglevel mask + */ +#ifndef VBOX +#define qemu_log_mask(b, ...) do { \ + if (loglevel & (b)) \ + fprintf(logfile, ## __VA_ARGS__); \ + } while (0) +#else +# define qemu_log_mask(b, ...) 
do { \ + if (loglevel & (b)) \ + Log((__VA_ARGS__)); \ + } while (0) +#endif + + + + +/* Special cases: */ + +/* cpu_dump_state() logging functions: */ +#ifndef VBOX +#define log_cpu_state(env, f) cpu_dump_state((env), logfile, fprintf, (f)); +#else +#define log_cpu_state(env, f) cpu_dump_state((env), NULL, NULL, (f)); +#endif +#define log_cpu_state_mask(b, env, f) do { \ + if (loglevel & (b)) log_cpu_state((env), (f)); \ + } while (0) + +/* disas() and target_disas() to logfile: */ +#define log_target_disas(start, len, flags) \ + target_disas(logfile, (start), (len), (flags)) +#define log_disas(start, len) \ + disas(logfile, (start), (len)) + +/* page_dump() output to the log file: */ +#define log_page_dump() page_dump(logfile) + + + +/* Maintenance: */ + +/* fflush() the log file */ +#ifndef VBOX +#define qemu_log_flush() fflush(logfile) +#else +# define qemu_log_flush() RTLogFlush(NULL) +#endif + +/* Close the log file */ +#ifndef VBOX +#define qemu_log_close() do { \ + fclose(logfile); \ + logfile = NULL; \ + } while (0) +#else +# define qemu_log_close() do { } while (0) +#endif + +/* Set up a new log file */ +#ifndef VBOX +#define qemu_log_set_file(f) do { \ + logfile = (f); \ + } while (0) +#else +# define qemu_log_set_file(f) do { } while (0) +#endif + +/* Set up a new log file, only if none is set */ +#ifndef VBOX +#define qemu_log_try_set_file(f) do { \ + if (!logfile) \ + logfile = (f); \ + } while (0) +#else +#define qemu_log_try_set_file(f) do { } while (0) +#endif + + +#endif diff --git a/src/recompiler/qemu-queue.h b/src/recompiler/qemu-queue.h new file mode 100644 index 00000000..1d077458 --- /dev/null +++ b/src/recompiler/qemu-queue.h @@ -0,0 +1,449 @@ +/* $NetBSD: queue.h,v 1.52 2009/04/20 09:56:08 mschuett Exp $ */ + +/* + * Qemu version: Copy from netbsd, removed debug code, removed some of + * the implementations. Left in lists, simple queues, tail queues and + * circular queues. + */ + +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef QEMU_SYS_QUEUE_H_ +#define QEMU_SYS_QUEUE_H_ + +/* + * This file defines four types of data structures: + * lists, simple queues, tail queues, and circular queues. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A simple queue is headed by a pair of pointers, one the head of the + * list and the other to the tail of the list. The elements are singly + * linked to save space, so elements can only be removed from the + * head of the list. New elements can be added to the list after + * an existing element, at the head of the list, or at the end of the + * list. A simple queue may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. + * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * List definitions. + */ +#define QLIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define QLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define QLIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ +#define QLIST_INIT(head) do { \ + (head)->lh_first = NULL; \ +} while (/*CONSTCOND*/0) + +#define QLIST_INSERT_AFTER(listelm, elm, field) do { \ + if (((elm)->field.le_next = (listelm)->field.le_next) != NULL) \ + (listelm)->field.le_next->field.le_prev = \ + &(elm)->field.le_next; \ + (listelm)->field.le_next = (elm); \ + (elm)->field.le_prev = &(listelm)->field.le_next; \ +} while (/*CONSTCOND*/0) + +#define QLIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + (elm)->field.le_next = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &(elm)->field.le_next; \ +} while (/*CONSTCOND*/0) + +#define QLIST_INSERT_HEAD(head, elm, field) do { \ + if (((elm)->field.le_next = (head)->lh_first) != NULL) \ + (head)->lh_first->field.le_prev = &(elm)->field.le_next;\ + (head)->lh_first = (elm); \ + (elm)->field.le_prev = &(head)->lh_first; \ +} while (/*CONSTCOND*/0) + +#define QLIST_REMOVE(elm, field) do { \ + if ((elm)->field.le_next != NULL) \ + (elm)->field.le_next->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = (elm)->field.le_next; \ +} while (/*CONSTCOND*/0) + +#define QLIST_FOREACH(var, head, field) \ + for ((var) = ((head)->lh_first); \ + (var); \ + (var) = ((var)->field.le_next)) + +#define QLIST_FOREACH_SAFE(var, head, field, next_var) \ + for ((var) = ((head)->lh_first); \ + (var) && ((next_var) = ((var)->field.le_next), 1); \ + (var) = (next_var)) + +/* + * List access methods. + */ +#define QLIST_EMPTY(head) ((head)->lh_first == NULL) +#define QLIST_FIRST(head) ((head)->lh_first) +#define QLIST_NEXT(elm, field) ((elm)->field.le_next) + + +/* + * Simple queue definitions. + */ +#define QSIMPLEQ_HEAD(name, type) \ +struct name { \ + struct type *sqh_first; /* first element */ \ + struct type **sqh_last; /* addr of last next element */ \ +} + +#define QSIMPLEQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).sqh_first } + +#define QSIMPLEQ_ENTRY(type) \ +struct { \ + struct type *sqe_next; /* next element */ \ +} + +/* + * Simple queue functions. 
+ */ +#define QSIMPLEQ_INIT(head) do { \ + (head)->sqh_first = NULL; \ + (head)->sqh_last = &(head)->sqh_first; \ +} while (/*CONSTCOND*/0) + +#define QSIMPLEQ_INSERT_HEAD(head, elm, field) do { \ + if (((elm)->field.sqe_next = (head)->sqh_first) == NULL) \ + (head)->sqh_last = &(elm)->field.sqe_next; \ + (head)->sqh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define QSIMPLEQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.sqe_next = NULL; \ + *(head)->sqh_last = (elm); \ + (head)->sqh_last = &(elm)->field.sqe_next; \ +} while (/*CONSTCOND*/0) + +#define QSIMPLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if (((elm)->field.sqe_next = (listelm)->field.sqe_next) == NULL) \ + (head)->sqh_last = &(elm)->field.sqe_next; \ + (listelm)->field.sqe_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define QSIMPLEQ_REMOVE_HEAD(head, field) do { \ + if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\ + (head)->sqh_last = &(head)->sqh_first; \ +} while (/*CONSTCOND*/0) + +#define QSIMPLEQ_REMOVE(head, elm, type, field) do { \ + if ((head)->sqh_first == (elm)) { \ + QSIMPLEQ_REMOVE_HEAD((head), field); \ + } else { \ + struct type *curelm = (head)->sqh_first; \ + while (curelm->field.sqe_next != (elm)) \ + curelm = curelm->field.sqe_next; \ + if ((curelm->field.sqe_next = \ + curelm->field.sqe_next->field.sqe_next) == NULL) \ + (head)->sqh_last = &(curelm)->field.sqe_next; \ + } \ +} while (/*CONSTCOND*/0) + +#define QSIMPLEQ_FOREACH(var, head, field) \ + for ((var) = ((head)->sqh_first); \ + (var); \ + (var) = ((var)->field.sqe_next)) + +#define QSIMPLEQ_FOREACH_SAFE(var, head, field, next) \ + for ((var) = ((head)->sqh_first); \ + (var) && ((next = ((var)->field.sqe_next)), 1); \ + (var) = (next)) + +#define QSIMPLEQ_CONCAT(head1, head2) do { \ + if (!QSIMPLEQ_EMPTY((head2))) { \ + *(head1)->sqh_last = (head2)->sqh_first; \ + (head1)->sqh_last = (head2)->sqh_last; \ + QSIMPLEQ_INIT((head2)); \ + } \ +} while (/*CONSTCOND*/0) + +#define QSIMPLEQ_LAST(head, type, field) \ + (QSIMPLEQ_EMPTY((head)) ? \ + NULL : \ + ((struct type *)(void *) \ + ((char *)((head)->sqh_last) - offsetof(struct type, field)))) + +/* + * Simple queue access methods. + */ +#define QSIMPLEQ_EMPTY(head) ((head)->sqh_first == NULL) +#define QSIMPLEQ_FIRST(head) ((head)->sqh_first) +#define QSIMPLEQ_NEXT(elm, field) ((elm)->field.sqe_next) + + +/* + * Tail queue definitions. + */ +#define Q_TAILQ_HEAD(name, type, qual) \ +struct name { \ + qual type *tqh_first; /* first element */ \ + qual type *qual *tqh_last; /* addr of last next element */ \ +} +#define QTAILQ_HEAD(name, type) Q_TAILQ_HEAD(name, struct type,) + +#define QTAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define Q_TAILQ_ENTRY(type, qual) \ +struct { \ + qual type *tqe_next; /* next element */ \ + qual type *qual *tqe_prev; /* address of previous next element */\ +} +#define QTAILQ_ENTRY(type) Q_TAILQ_ENTRY(struct type,) + +/* + * Tail queue functions. 
+ */ +#define QTAILQ_INIT(head) do { \ + (head)->tqh_first = NULL; \ + (head)->tqh_last = &(head)->tqh_first; \ +} while (/*CONSTCOND*/0) + +#define QTAILQ_INSERT_HEAD(head, elm, field) do { \ + if (((elm)->field.tqe_next = (head)->tqh_first) != NULL) \ + (head)->tqh_first->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (head)->tqh_first = (elm); \ + (elm)->field.tqe_prev = &(head)->tqh_first; \ +} while (/*CONSTCOND*/0) + +#define QTAILQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.tqe_next = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &(elm)->field.tqe_next; \ +} while (/*CONSTCOND*/0) + +#define QTAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\ + (elm)->field.tqe_next->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (listelm)->field.tqe_next = (elm); \ + (elm)->field.tqe_prev = &(listelm)->field.tqe_next; \ +} while (/*CONSTCOND*/0) + +#define QTAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + (elm)->field.tqe_next = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &(elm)->field.tqe_next; \ +} while (/*CONSTCOND*/0) + +#define QTAILQ_REMOVE(head, elm, field) do { \ + if (((elm)->field.tqe_next) != NULL) \ + (elm)->field.tqe_next->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = (elm)->field.tqe_next; \ +} while (/*CONSTCOND*/0) + +#define QTAILQ_FOREACH(var, head, field) \ + for ((var) = ((head)->tqh_first); \ + (var); \ + (var) = ((var)->field.tqe_next)) + +#define QTAILQ_FOREACH_SAFE(var, head, field, next_var) \ + for ((var) = ((head)->tqh_first); \ + (var) && ((next_var) = ((var)->field.tqe_next), 1); \ + (var) = (next_var)) + +#define QTAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = (*(((struct headname *)((head)->tqh_last))->tqh_last)); \ + (var); \ + (var) = (*(((struct headname *)((var)->field.tqe_prev))->tqh_last))) + +/* + * Tail queue access methods. + */ +#define QTAILQ_EMPTY(head) ((head)->tqh_first == NULL) +#define QTAILQ_FIRST(head) ((head)->tqh_first) +#define QTAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define QTAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) +#define QTAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + + +/* + * Circular queue definitions. + */ +#define QCIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define QCIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&head, (void *)&head } + +#define QCIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. 
+ */ +#define QCIRCLEQ_INIT(head) do { \ + (head)->cqh_first = (void *)(head); \ + (head)->cqh_last = (void *)(head); \ +} while (/*CONSTCOND*/0) + +#define QCIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + (elm)->field.cqe_next = (listelm)->field.cqe_next; \ + (elm)->field.cqe_prev = (listelm); \ + if ((listelm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (listelm)->field.cqe_next->field.cqe_prev = (elm); \ + (listelm)->field.cqe_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define QCIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + (elm)->field.cqe_next = (listelm); \ + (elm)->field.cqe_prev = (listelm)->field.cqe_prev; \ + if ((listelm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (listelm)->field.cqe_prev->field.cqe_next = (elm); \ + (listelm)->field.cqe_prev = (elm); \ +} while (/*CONSTCOND*/0) + +#define QCIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + (elm)->field.cqe_next = (head)->cqh_first; \ + (elm)->field.cqe_prev = (void *)(head); \ + if ((head)->cqh_last == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (head)->cqh_first->field.cqe_prev = (elm); \ + (head)->cqh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define QCIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.cqe_next = (void *)(head); \ + (elm)->field.cqe_prev = (head)->cqh_last; \ + if ((head)->cqh_first == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (head)->cqh_last->field.cqe_next = (elm); \ + (head)->cqh_last = (elm); \ +} while (/*CONSTCOND*/0) + +#define QCIRCLEQ_REMOVE(head, elm, field) do { \ + if ((elm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm)->field.cqe_prev; \ + else \ + (elm)->field.cqe_next->field.cqe_prev = \ + (elm)->field.cqe_prev; \ + if ((elm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm)->field.cqe_next; \ + else \ + (elm)->field.cqe_prev->field.cqe_next = \ + (elm)->field.cqe_next; \ +} while (/*CONSTCOND*/0) + +#define QCIRCLEQ_FOREACH(var, head, field) \ + for ((var) = ((head)->cqh_first); \ + (var) != (const void *)(head); \ + (var) = ((var)->field.cqe_next)) + +#define QCIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = ((head)->cqh_last); \ + (var) != (const void *)(head); \ + (var) = ((var)->field.cqe_prev)) + +/* + * Circular queue access methods. + */ +#define QCIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) +#define QCIRCLEQ_FIRST(head) ((head)->cqh_first) +#define QCIRCLEQ_LAST(head) ((head)->cqh_last) +#define QCIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) +#define QCIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) + +#define QCIRCLEQ_LOOP_NEXT(head, elm, field) \ + (((elm)->field.cqe_next == (void *)(head)) \ + ? ((head)->cqh_first) \ + : (elm->field.cqe_next)) +#define QCIRCLEQ_LOOP_PREV(head, elm, field) \ + (((elm)->field.cqe_prev == (void *)(head)) \ + ? ((head)->cqh_last) \ + : (elm->field.cqe_prev)) + +#endif /* !QEMU_SYS_QUEUE_H_ */ diff --git a/src/recompiler/qemu-timer.h b/src/recompiler/qemu-timer.h new file mode 100644 index 00000000..209c4d2c --- /dev/null +++ b/src/recompiler/qemu-timer.h @@ -0,0 +1,272 @@ +#ifndef QEMU_TIMER_H +#define QEMU_TIMER_H + +#include "qemu-common.h" + +/* timers */ +#ifndef VBOX + +typedef struct QEMUClock QEMUClock; +typedef void QEMUTimerCB(void *opaque); + +/* The real time clock should be used only for stuff which does not + change the virtual machine state, as it is run even if the virtual + machine is stopped. The real time clock has a frequency of 1000 + Hz. 
*/ +extern QEMUClock *rt_clock; + +/* The virtual clock is only run during the emulation. It is stopped + when the virtual machine is stopped. Virtual timers use a high + precision clock, usually cpu cycles (use ticks_per_sec). */ +extern QEMUClock *vm_clock; + +/* The host clock should be use for device models that emulate accurate + real time sources. It will continue to run when the virtual machine + is suspended, and it will reflect system time changes the host may + undergo (e.g. due to NTP). The host clock has the same precision as + the virtual clock. */ +extern QEMUClock *host_clock; + +int64_t qemu_get_clock(QEMUClock *clock); +int64_t qemu_get_clock_ns(QEMUClock *clock); +void qemu_clock_enable(QEMUClock *clock, int enabled); + +QEMUTimer *qemu_new_timer(QEMUClock *clock, QEMUTimerCB *cb, void *opaque); +void qemu_free_timer(QEMUTimer *ts); +void qemu_del_timer(QEMUTimer *ts); +void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time); +int qemu_timer_pending(QEMUTimer *ts); +int qemu_timer_expired(QEMUTimer *timer_head, int64_t current_time); + +void qemu_run_all_timers(void); +int qemu_alarm_pending(void); +int64_t qemu_next_deadline(void); +void configure_alarms(char const *opt); +void configure_icount(const char *option); +int qemu_calculate_timeout(void); +void init_clocks(void); +int init_timer_alarm(void); +void quit_timers(void); + +static inline int64_t get_ticks_per_sec(void) +{ + return 1000000000LL; +} + + +void qemu_get_timer(QEMUFile *f, QEMUTimer *ts); +void qemu_put_timer(QEMUFile *f, QEMUTimer *ts); + +/* ptimer.c */ +typedef struct ptimer_state ptimer_state; +typedef void (*ptimer_cb)(void *opaque); + +ptimer_state *ptimer_init(QEMUBH *bh); +void ptimer_set_period(ptimer_state *s, int64_t period); +void ptimer_set_freq(ptimer_state *s, uint32_t freq); +void ptimer_set_limit(ptimer_state *s, uint64_t limit, int reload); +uint64_t ptimer_get_count(ptimer_state *s); +void ptimer_set_count(ptimer_state *s, uint64_t count); +void ptimer_run(ptimer_state *s, int oneshot); +void ptimer_stop(ptimer_state *s); +void qemu_put_ptimer(QEMUFile *f, ptimer_state *s); +void qemu_get_ptimer(QEMUFile *f, ptimer_state *s); + +/* icount */ +int64_t qemu_icount_round(int64_t count); +extern int64_t qemu_icount; +#endif /* !VBOX */ +extern int use_icount; +#ifndef VBOX +extern int icount_time_shift; +extern int64_t qemu_icount_bias; +int64_t cpu_get_icount(void); + +/*******************************************/ +/* host CPU ticks (if available) */ + +#if defined(_ARCH_PPC) + +static inline int64_t cpu_get_real_ticks(void) +{ + int64_t retval; +#ifdef _ARCH_PPC64 + /* This reads timebase in one 64bit go and includes Cell workaround from: + http://ozlabs.org/pipermail/linuxppc-dev/2006-October/027052.html + */ + __asm__ __volatile__ ("mftb %0\n\t" + "cmpwi %0,0\n\t" + "beq- $-8" + : "=r" (retval)); +#else + /* http://ozlabs.org/pipermail/linuxppc-dev/1999-October/003889.html */ + unsigned long junk; + __asm__ __volatile__ ("mfspr %1,269\n\t" /* mftbu */ + "mfspr %L0,268\n\t" /* mftb */ + "mfspr %0,269\n\t" /* mftbu */ + "cmpw %0,%1\n\t" + "bne $-16" + : "=r" (retval), "=r" (junk)); +#endif + return retval; +} + +#elif defined(__i386__) + +static inline int64_t cpu_get_real_ticks(void) +{ + int64_t val; + asm volatile ("rdtsc" : "=A" (val)); + return val; +} + +#elif defined(__x86_64__) + +static inline int64_t cpu_get_real_ticks(void) +{ + uint32_t low,high; + int64_t val; + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + val = high; + val <<= 32; + val |= low; + return val; +} + 
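
A minimal usage sketch of the QEMUTimer API declared above may help when reading the rest of this header. It is illustrative only and not part of the diff: MyDeviceState, my_timer_cb and my_device_init are invented names, and it assumes the convention suggested by get_ticks_per_sec() that vm_clock expire times are expressed in nanoseconds and only advance while the guest runs.

    typedef struct MyDeviceState {
        QEMUTimer *timer;
    } MyDeviceState;

    static void my_timer_cb(void *opaque)
    {
        MyDeviceState *s = opaque;
        /* ... update device state, raise an interrupt, ... */

        /* Re-arm one second of guest time later; vm_clock stops
           whenever the virtual machine is stopped. */
        qemu_mod_timer(s->timer, qemu_get_clock(vm_clock) + get_ticks_per_sec());
    }

    static void my_device_init(MyDeviceState *s)
    {
        s->timer = qemu_new_timer(vm_clock, my_timer_cb, s);
        qemu_mod_timer(s->timer, qemu_get_clock(vm_clock) + get_ticks_per_sec());
    }

In this scheme qemu_del_timer() cancels a pending expiry and qemu_free_timer() releases the timer when the device is torn down.
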
+#elif defined(__hppa__) + +static inline int64_t cpu_get_real_ticks(void) +{ + int val; + asm volatile ("mfctl %%cr16, %0" : "=r"(val)); + return val; +} + +#elif defined(__ia64) + +static inline int64_t cpu_get_real_ticks(void) +{ + int64_t val; + asm volatile ("mov %0 = ar.itc" : "=r"(val) :: "memory"); + return val; +} + +#elif defined(__s390__) + +static inline int64_t cpu_get_real_ticks(void) +{ + int64_t val; + asm volatile("stck 0(%1)" : "=m" (val) : "a" (&val) : "cc"); + return val; +} + +#elif defined(__sparc_v8plus__) || defined(__sparc_v8plusa__) || defined(__sparc_v9__) + +static inline int64_t cpu_get_real_ticks (void) +{ +#if defined(_LP64) + uint64_t rval; + asm volatile("rd %%tick,%0" : "=r"(rval)); + return rval; +#else + union { + uint64_t i64; + struct { + uint32_t high; + uint32_t low; + } i32; + } rval; + asm volatile("rd %%tick,%1; srlx %1,32,%0" + : "=r"(rval.i32.high), "=r"(rval.i32.low)); + return rval.i64; +#endif +} + +#elif defined(__mips__) && \ + ((defined(__mips_isa_rev) && __mips_isa_rev >= 2) || defined(__linux__)) +/* + * binutils wants to use rdhwr only on mips32r2 + * but as linux kernel emulate it, it's fine + * to use it. + * + */ +#define MIPS_RDHWR(rd, value) { \ + __asm__ __volatile__ (".set push\n\t" \ + ".set mips32r2\n\t" \ + "rdhwr %0, "rd"\n\t" \ + ".set pop" \ + : "=r" (value)); \ + } + +static inline int64_t cpu_get_real_ticks(void) +{ + /* On kernels >= 2.6.25 rdhwr <reg>, $2 and $3 are emulated */ + uint32_t count; + static uint32_t cyc_per_count = 0; + + if (!cyc_per_count) { + MIPS_RDHWR("$3", cyc_per_count); + } + + MIPS_RDHWR("$2", count); + return (int64_t)(count * cyc_per_count); +} + +#elif defined(__alpha__) + +static inline int64_t cpu_get_real_ticks(void) +{ + uint64_t cc; + uint32_t cur, ofs; + + asm volatile("rpcc %0" : "=r"(cc)); + cur = cc; + ofs = cc >> 32; + return cur - ofs; +} + +#else +/* The host CPU doesn't have an easily accessible cycle counter. + Just return a monotonically increasing value. This will be + totally wrong, but hopefully better than nothing. */ +static inline int64_t cpu_get_real_ticks (void) +{ + static int64_t ticks = 0; + return ticks++; +} +#endif + +#endif /* !VBOX */ + +#ifdef NEED_CPU_H +/* Deterministic execution requires that IO only be performed on the last + instruction of a TB so that interrupts take effect immediately. */ +static inline int can_do_io(CPUState *env) +{ + if (!use_icount) + return 1; + + /* If not executing code then assume we are ok. 
*/ + if (!env->current_tb) + return 1; + + return env->can_do_io != 0; +} +#endif + +#ifndef VBOX + +#ifdef CONFIG_PROFILER +static inline int64_t profile_getclock(void) +{ + return cpu_get_real_ticks(); +} + +extern int64_t qemu_time, qemu_time_start; +extern int64_t tlb_flush_time; +extern int64_t dev_time; +#endif + +#endif /* !VBOX */ + +#endif diff --git a/src/recompiler/softmmu_defs.h b/src/recompiler/softmmu_defs.h new file mode 100644 index 00000000..e23c3498 --- /dev/null +++ b/src/recompiler/softmmu_defs.h @@ -0,0 +1,57 @@ +#ifndef SOFTMMU_DEFS_H +#define SOFTMMU_DEFS_H + +#ifndef VBOX +uint8_t REGPARM __ldb_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stb_mmu(target_ulong addr, uint8_t val, int mmu_idx); +uint16_t REGPARM __ldw_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stw_mmu(target_ulong addr, uint16_t val, int mmu_idx); +uint32_t REGPARM __ldl_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stl_mmu(target_ulong addr, uint32_t val, int mmu_idx); +uint64_t REGPARM __ldq_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stq_mmu(target_ulong addr, uint64_t val, int mmu_idx); + +uint8_t REGPARM __ldb_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stb_cmmu(target_ulong addr, uint8_t val, int mmu_idx); +uint16_t REGPARM __ldw_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stw_cmmu(target_ulong addr, uint16_t val, int mmu_idx); +uint32_t REGPARM __ldl_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stl_cmmu(target_ulong addr, uint32_t val, int mmu_idx); +uint64_t REGPARM __ldq_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx); +#else /* VBOX */ +RTCCUINTREG REGPARM __ldb_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stb_mmu(target_ulong addr, uint8_t val, int mmu_idx); +RTCCUINTREG REGPARM __ldw_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stw_mmu(target_ulong addr, uint16_t val, int mmu_idx); +RTCCUINTREG REGPARM __ldl_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stl_mmu(target_ulong addr, uint32_t val, int mmu_idx); +uint64_t REGPARM __ldq_mmu(target_ulong addr, int mmu_idx); +void REGPARM __stq_mmu(target_ulong addr, uint64_t val, int mmu_idx); + +RTCCUINTREG REGPARM __ldb_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stb_cmmu(target_ulong addr, uint8_t val, int mmu_idx); +RTCCUINTREG REGPARM __ldw_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stw_cmmu(target_ulong addr, uint16_t val, int mmu_idx); +RTCCUINTREG REGPARM __ldl_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stl_cmmu(target_ulong addr, uint32_t val, int mmu_idx); +uint64_t REGPARM __ldq_cmmu(target_ulong addr, int mmu_idx); +void REGPARM __stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx); + +# ifdef REM_PHYS_ADDR_IN_TLB +RTCCUINTREG REGPARM __ldb_vbox_phys(RTCCUINTREG addr); +RTCCUINTREG REGPARM __ldub_vbox_phys(RTCCUINTREG addr); +void REGPARM __stb_vbox_phys(RTCCUINTREG addr, RTCCUINTREG val); +RTCCUINTREG REGPARM __ldw_vbox_phys(RTCCUINTREG addr); +RTCCUINTREG REGPARM __lduw_vbox_phys(RTCCUINTREG addr); +void REGPARM __stw_vbox_phys(RTCCUINTREG addr, RTCCUINTREG val); +RTCCUINTREG REGPARM __ldl_vbox_phys(RTCCUINTREG addr); +RTCCUINTREG REGPARM __ldul_vbox_phys(RTCCUINTREG addr); +void REGPARM __stl_vbox_phys(RTCCUINTREG addr, RTCCUINTREG val); +uint64_t REGPARM __ldq_vbox_phys(RTCCUINTREG addr); +void REGPARM __stq_vbox_phys(RTCCUINTREG addr, uint64_t val); +# endif + +#endif /* VBOX */ + +#endif diff --git a/src/recompiler/softmmu_exec.h 
b/src/recompiler/softmmu_exec.h new file mode 100644 index 00000000..28d1d53d --- /dev/null +++ b/src/recompiler/softmmu_exec.h @@ -0,0 +1,153 @@ +/* Common softmmu definitions and inline routines. */ + +/* XXX: find something cleaner. + * Furthermore, this is false for 64 bits targets + */ +#define ldul_user ldl_user +#define ldul_kernel ldl_kernel +#define ldul_hypv ldl_hypv +#define ldul_executive ldl_executive +#define ldul_supervisor ldl_supervisor + +#include "softmmu_defs.h" + +#define ACCESS_TYPE 0 +#define MEMSUFFIX MMU_MODE0_SUFFIX +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" +#undef ACCESS_TYPE +#undef MEMSUFFIX + +#define ACCESS_TYPE 1 +#define MEMSUFFIX MMU_MODE1_SUFFIX +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" +#undef ACCESS_TYPE +#undef MEMSUFFIX + +#if (NB_MMU_MODES >= 3) + +#define ACCESS_TYPE 2 +#define MEMSUFFIX MMU_MODE2_SUFFIX +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" +#undef ACCESS_TYPE +#undef MEMSUFFIX +#endif /* (NB_MMU_MODES >= 3) */ + +#if (NB_MMU_MODES >= 4) + +#define ACCESS_TYPE 3 +#define MEMSUFFIX MMU_MODE3_SUFFIX +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" +#undef ACCESS_TYPE +#undef MEMSUFFIX +#endif /* (NB_MMU_MODES >= 4) */ + +#if (NB_MMU_MODES >= 5) + +#define ACCESS_TYPE 4 +#define MEMSUFFIX MMU_MODE4_SUFFIX +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" +#undef ACCESS_TYPE +#undef MEMSUFFIX +#endif /* (NB_MMU_MODES >= 5) */ + +#if (NB_MMU_MODES >= 6) + +#define ACCESS_TYPE 5 +#define MEMSUFFIX MMU_MODE5_SUFFIX +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" +#undef ACCESS_TYPE +#undef MEMSUFFIX +#endif /* (NB_MMU_MODES >= 6) */ + +#if (NB_MMU_MODES > 6) +#error "NB_MMU_MODES > 6 is not supported for now" +#endif /* (NB_MMU_MODES > 6) */ + +/* these access are slower, they must be as rare as possible */ +#define ACCESS_TYPE (NB_MMU_MODES) +#define MEMSUFFIX _data +#define DATA_SIZE 1 +#include "softmmu_header.h" + +#define DATA_SIZE 2 +#include "softmmu_header.h" + +#define DATA_SIZE 4 +#include "softmmu_header.h" + +#define DATA_SIZE 8 +#include "softmmu_header.h" +#undef ACCESS_TYPE +#undef MEMSUFFIX + +#define ldub(p) ldub_data(p) +#define ldsb(p) ldsb_data(p) +#define lduw(p) lduw_data(p) +#define ldsw(p) ldsw_data(p) +#define ldl(p) ldl_data(p) +#define ldq(p) ldq_data(p) + +#define stb(p, v) stb_data(p, v) +#define stw(p, v) stw_data(p, v) +#define stl(p, v) stl_data(p, v) +#define stq(p, v) stq_data(p, v) diff --git a/src/recompiler/softmmu_header.h b/src/recompiler/softmmu_header.h new file mode 100644 index 00000000..06e2466d --- /dev/null +++ b/src/recompiler/softmmu_header.h @@ -0,0 +1,208 @@ 
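
The layered includes in softmmu_exec.h are easier to follow with one expansion written out. As a rough sketch (not part of the diff, and assuming an i386-style target where MMU_MODE0_SUFFIX is _kernel), the pass with ACCESS_TYPE 0, MEMSUFFIX _kernel and DATA_SIZE 4 over softmmu_header.h, whose template follows below, effectively emits:

    static inline uint32_t ldl_kernel(target_ulong ptr)
    {
        int mmu_idx = 0;   /* CPU_MMU_INDEX expands to ACCESS_TYPE here */
        int page_index = (ptr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);

        if (unlikely(env->tlb_table[mmu_idx][page_index].addr_read !=
                     (ptr & (TARGET_PAGE_MASK | 3)))) {
            /* TLB miss, unaligned access or I/O page: out-of-line MMU helper. */
            return __ldl_mmu(ptr, mmu_idx);
        }
        /* TLB hit: add the cached guest-to-host delta and load directly. */
        return ldl_raw((uint8_t *)(uintptr_t)
                       (ptr + env->tlb_table[mmu_idx][page_index].addend));
    }

The final block with ACCESS_TYPE set to NB_MMU_MODES and MEMSUFFIX _data produces the ldl_data()/stl_data() family, using cpu_mmu_index(env) instead of a fixed index, and the ldl()/stl() style aliases at the end of the file map onto those.
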
+/* + * Software MMU support + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#if DATA_SIZE == 8 +#define SUFFIX q +#define USUFFIX q +#define DATA_TYPE uint64_t +#elif DATA_SIZE == 4 +#define SUFFIX l +#define USUFFIX l +#define DATA_TYPE uint32_t +#elif DATA_SIZE == 2 +#define SUFFIX w +#define USUFFIX uw +#define DATA_TYPE uint16_t +#define DATA_STYPE int16_t +#elif DATA_SIZE == 1 +#define SUFFIX b +#define USUFFIX ub +#define DATA_TYPE uint8_t +#define DATA_STYPE int8_t +#else +#error unsupported data size +#endif + +#if ACCESS_TYPE < (NB_MMU_MODES) + +#define CPU_MMU_INDEX ACCESS_TYPE +#define MMUSUFFIX _mmu + +#elif ACCESS_TYPE == (NB_MMU_MODES) + +#define CPU_MMU_INDEX (cpu_mmu_index(env)) +#define MMUSUFFIX _mmu + +#elif ACCESS_TYPE == (NB_MMU_MODES + 1) + +#define CPU_MMU_INDEX (cpu_mmu_index(env)) +#define MMUSUFFIX _cmmu + +#else +#error invalid ACCESS_TYPE +#endif + +#if DATA_SIZE == 8 +#define RES_TYPE uint64_t +#else +#define RES_TYPE uint32_t +#endif + +#if ACCESS_TYPE == (NB_MMU_MODES + 1) +#define ADDR_READ addr_code +#else +#define ADDR_READ addr_read +#endif + +/* generic load/store macros */ + +static inline RES_TYPE glue(glue(ld, USUFFIX), MEMSUFFIX)(target_ulong ptr) +{ + int page_index; + RES_TYPE res; + target_ulong addr; + uintptr_t physaddr; + int mmu_idx; + + addr = ptr; + page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + mmu_idx = CPU_MMU_INDEX; + if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ != + (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) { + res = glue(glue(__ld, SUFFIX), MMUSUFFIX)(addr, mmu_idx); + } else { + physaddr = addr + env->tlb_table[mmu_idx][page_index].addend; + res = glue(glue(ld, USUFFIX), _raw)((uint8_t *)physaddr); + } + return res; +} + +#if DATA_SIZE <= 2 +static inline int glue(glue(lds, SUFFIX), MEMSUFFIX)(target_ulong ptr) +{ + int res, page_index; + target_ulong addr; + uintptr_t physaddr; + int mmu_idx; + + addr = ptr; + page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + mmu_idx = CPU_MMU_INDEX; + if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ != + (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) { + res = (DATA_STYPE)glue(glue(__ld, SUFFIX), MMUSUFFIX)(addr, mmu_idx); + } else { + physaddr = addr + env->tlb_table[mmu_idx][page_index].addend; + res = glue(glue(lds, SUFFIX), _raw)((uint8_t *)physaddr); + } + return 
res; +} +#endif + +#if ACCESS_TYPE != (NB_MMU_MODES + 1) + +/* generic store macro */ + +static inline void glue(glue(st, SUFFIX), MEMSUFFIX)(target_ulong ptr, RES_TYPE v) +{ + int page_index; + target_ulong addr; + uintptr_t physaddr; + int mmu_idx; + + addr = ptr; + page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + mmu_idx = CPU_MMU_INDEX; + if (unlikely(env->tlb_table[mmu_idx][page_index].addr_write != + (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) { + glue(glue(__st, SUFFIX), MMUSUFFIX)(addr, v, mmu_idx); + } else { + physaddr = addr + env->tlb_table[mmu_idx][page_index].addend; + glue(glue(st, SUFFIX), _raw)((uint8_t *)physaddr, v); + } +} + +#endif /* ACCESS_TYPE != (NB_MMU_MODES + 1) */ + +#if ACCESS_TYPE != (NB_MMU_MODES + 1) + +#if DATA_SIZE == 8 +static inline float64 glue(ldfq, MEMSUFFIX)(target_ulong ptr) +{ + union { + float64 d; + uint64_t i; + } u; + u.i = glue(ldq, MEMSUFFIX)(ptr); + return u.d; +} + +static inline void glue(stfq, MEMSUFFIX)(target_ulong ptr, float64 v) +{ + union { + float64 d; + uint64_t i; + } u; + u.d = v; + glue(stq, MEMSUFFIX)(ptr, u.i); +} +#endif /* DATA_SIZE == 8 */ + +#if DATA_SIZE == 4 +static inline float32 glue(ldfl, MEMSUFFIX)(target_ulong ptr) +{ + union { + float32 f; + uint32_t i; + } u; + u.i = glue(ldl, MEMSUFFIX)(ptr); + return u.f; +} + +static inline void glue(stfl, MEMSUFFIX)(target_ulong ptr, float32 v) +{ + union { + float32 f; + uint32_t i; + } u; + u.f = v; + glue(stl, MEMSUFFIX)(ptr, u.i); +} +#endif /* DATA_SIZE == 4 */ + +#endif /* ACCESS_TYPE != (NB_MMU_MODES + 1) */ + +#undef RES_TYPE +#undef DATA_TYPE +#undef DATA_STYPE +#undef SUFFIX +#undef USUFFIX +#undef DATA_SIZE +#undef CPU_MMU_INDEX +#undef MMUSUFFIX +#undef ADDR_READ diff --git a/src/recompiler/softmmu_template.h b/src/recompiler/softmmu_template.h new file mode 100644 index 00000000..a31411ba --- /dev/null +++ b/src/recompiler/softmmu_template.h @@ -0,0 +1,366 @@ +/* + * Software MMU support + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. 
+ */ + +#include "qemu-timer.h" + +#define DATA_SIZE (1 << SHIFT) + +#if DATA_SIZE == 8 +#define SUFFIX q +#define USUFFIX q +#define DATA_TYPE uint64_t +#ifdef VBOX +# define DATA_TYPE_PROMOTED uint64_t +#endif +#elif DATA_SIZE == 4 +#define SUFFIX l +#define USUFFIX l +#define DATA_TYPE uint32_t +#ifdef VBOX +# define DATA_TYPE_PROMOTED RTCCUINTREG +#endif +#elif DATA_SIZE == 2 +#define SUFFIX w +#define USUFFIX uw +#define DATA_TYPE uint16_t +#ifdef VBOX +# define DATA_TYPE_PROMOTED RTCCUINTREG +#endif +#elif DATA_SIZE == 1 +#define SUFFIX b +#define USUFFIX ub +#define DATA_TYPE uint8_t +#ifdef VBOX +# define DATA_TYPE_PROMOTED RTCCUINTREG +#endif +#else +#error unsupported data size +#endif + +#ifdef SOFTMMU_CODE_ACCESS +#define READ_ACCESS_TYPE 2 +#define ADDR_READ addr_code +#else +#define READ_ACCESS_TYPE 0 +#define ADDR_READ addr_read +#endif + +static DATA_TYPE glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(target_ulong addr, + int mmu_idx, + void *retaddr); +static inline DATA_TYPE glue(io_read, SUFFIX)(target_phys_addr_t physaddr, + target_ulong addr, + void *retaddr) +{ + DATA_TYPE res; + int index; + index = (physaddr >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + physaddr = (physaddr & TARGET_PAGE_MASK) + addr; + env->mem_io_pc = (uintptr_t)retaddr; + if (index > (IO_MEM_NOTDIRTY >> IO_MEM_SHIFT) + && !can_do_io(env)) { + cpu_io_recompile(env, retaddr); + } + + env->mem_io_vaddr = addr; +#if SHIFT <= 2 + res = io_mem_read[index][SHIFT](io_mem_opaque[index], physaddr); +#else +#ifdef TARGET_WORDS_BIGENDIAN + res = (uint64_t)io_mem_read[index][2](io_mem_opaque[index], physaddr) << 32; + res |= io_mem_read[index][2](io_mem_opaque[index], physaddr + 4); +#else + res = io_mem_read[index][2](io_mem_opaque[index], physaddr); + res |= (uint64_t)io_mem_read[index][2](io_mem_opaque[index], physaddr + 4) << 32; +#endif +#endif /* SHIFT > 2 */ + return res; +} + +/* handle all cases except unaligned access which span two pages */ +#ifndef VBOX +DATA_TYPE REGPARM glue(glue(__ld, SUFFIX), MMUSUFFIX)(target_ulong addr, + int mmu_idx) +#else +/* Load helpers invoked from generated code, and TCG makes an assumption + that valid value takes the whole register, why gcc after 4.3 may + use only lower part of register for smaller types. So force promotion. 
*/ +DATA_TYPE_PROMOTED REGPARM +glue(glue(__ld, SUFFIX), MMUSUFFIX)(target_ulong addr, + int mmu_idx) +#endif +{ + DATA_TYPE res; + int index; + target_ulong tlb_addr; + target_phys_addr_t ioaddr; + uintptr_t addend; + void *retaddr; + + /* test if there is match for unaligned or IO access */ + /* XXX: could done more in memory macro in a non portable way */ + index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + redo: + tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ; + if ((addr & TARGET_PAGE_MASK) == (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + if (tlb_addr & ~TARGET_PAGE_MASK) { + /* IO access */ + if ((addr & (DATA_SIZE - 1)) != 0) + goto do_unaligned_access; + retaddr = GETPC(); + ioaddr = env->iotlb[mmu_idx][index]; + res = glue(io_read, SUFFIX)(ioaddr, addr, retaddr); + } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) { + /* slow unaligned access (it spans two pages or IO) */ + do_unaligned_access: + retaddr = GETPC(); +#ifdef ALIGNED_ONLY + do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); +#endif + res = glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(addr, + mmu_idx, retaddr); + } else { + /* unaligned/aligned access in the same page */ +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) { + retaddr = GETPC(); + do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); + } +#endif + addend = env->tlb_table[mmu_idx][index].addend; + res = glue(glue(ld, USUFFIX), _raw)((uint8_t *)(uintptr_t)(addr+addend)); + } + } else { + /* the page is not in the TLB : fill it */ + retaddr = GETPC(); +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) + do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); +#endif + tlb_fill(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); + goto redo; + } + return res; +} + +/* handle all unaligned cases */ +static DATA_TYPE glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(target_ulong addr, + int mmu_idx, + void *retaddr) +{ + DATA_TYPE res, res1, res2; + int index, shift; + target_phys_addr_t ioaddr; + uintptr_t addend; + target_ulong tlb_addr, addr1, addr2; + + index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + redo: + tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ; + if ((addr & TARGET_PAGE_MASK) == (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + if (tlb_addr & ~TARGET_PAGE_MASK) { + /* IO access */ + if ((addr & (DATA_SIZE - 1)) != 0) + goto do_unaligned_access; + ioaddr = env->iotlb[mmu_idx][index]; + res = glue(io_read, SUFFIX)(ioaddr, addr, retaddr); + } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) { + do_unaligned_access: + /* slow unaligned access (it spans two pages) */ + addr1 = addr & ~(DATA_SIZE - 1); + addr2 = addr1 + DATA_SIZE; + res1 = glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(addr1, + mmu_idx, retaddr); + res2 = glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(addr2, + mmu_idx, retaddr); + shift = (addr & (DATA_SIZE - 1)) * 8; +#ifdef TARGET_WORDS_BIGENDIAN + res = (res1 << shift) | (res2 >> ((DATA_SIZE * 8) - shift)); +#else + res = (res1 >> shift) | (res2 << ((DATA_SIZE * 8) - shift)); +#endif + res = (DATA_TYPE)res; + } else { + /* unaligned/aligned access in the same page */ + addend = env->tlb_table[mmu_idx][index].addend; + res = glue(glue(ld, USUFFIX), _raw)((uint8_t *)(uintptr_t)(addr+addend)); + } + } else { + /* the page is not in the TLB : fill it */ + tlb_fill(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); + goto redo; + } + return res; +} + +#ifndef SOFTMMU_CODE_ACCESS + +static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(target_ulong 
addr, + DATA_TYPE val, + int mmu_idx, + void *retaddr); + +static inline void glue(io_write, SUFFIX)(target_phys_addr_t physaddr, + DATA_TYPE val, + target_ulong addr, + void *retaddr) +{ + int index; + index = (physaddr >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1); + physaddr = (physaddr & TARGET_PAGE_MASK) + addr; + if (index > (IO_MEM_NOTDIRTY >> IO_MEM_SHIFT) + && !can_do_io(env)) { + cpu_io_recompile(env, retaddr); + } + + env->mem_io_vaddr = addr; + env->mem_io_pc = (uintptr_t)retaddr; +#if SHIFT <= 2 + io_mem_write[index][SHIFT](io_mem_opaque[index], physaddr, val); +#else +#ifdef TARGET_WORDS_BIGENDIAN + io_mem_write[index][2](io_mem_opaque[index], physaddr, val >> 32); + io_mem_write[index][2](io_mem_opaque[index], physaddr + 4, val); +#else + io_mem_write[index][2](io_mem_opaque[index], physaddr, val); + io_mem_write[index][2](io_mem_opaque[index], physaddr + 4, val >> 32); +#endif +#endif /* SHIFT > 2 */ +} + +void REGPARM glue(glue(__st, SUFFIX), MMUSUFFIX)(target_ulong addr, + DATA_TYPE val, + int mmu_idx) +{ + target_phys_addr_t ioaddr; + uintptr_t addend; + target_ulong tlb_addr; + void *retaddr; + int index; + + index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + redo: + tlb_addr = env->tlb_table[mmu_idx][index].addr_write; + if ((addr & TARGET_PAGE_MASK) == (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + if (tlb_addr & ~TARGET_PAGE_MASK) { + /* IO access */ + if ((addr & (DATA_SIZE - 1)) != 0) + goto do_unaligned_access; + retaddr = GETPC(); + ioaddr = env->iotlb[mmu_idx][index]; + glue(io_write, SUFFIX)(ioaddr, val, addr, retaddr); + } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) { + do_unaligned_access: + retaddr = GETPC(); +#ifdef ALIGNED_ONLY + do_unaligned_access(addr, 1, mmu_idx, retaddr); +#endif + glue(glue(slow_st, SUFFIX), MMUSUFFIX)(addr, val, + mmu_idx, retaddr); + } else { + /* aligned/unaligned access in the same page */ +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) { + retaddr = GETPC(); + do_unaligned_access(addr, 1, mmu_idx, retaddr); + } +#endif + addend = env->tlb_table[mmu_idx][index].addend; + glue(glue(st, SUFFIX), _raw)((uint8_t *)(uintptr_t)(addr+addend), val); + } + } else { + /* the page is not in the TLB : fill it */ + retaddr = GETPC(); +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) + do_unaligned_access(addr, 1, mmu_idx, retaddr); +#endif + tlb_fill(addr, 1, mmu_idx, retaddr); + goto redo; + } +} + +/* handles all unaligned cases */ +static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(target_ulong addr, + DATA_TYPE val, + int mmu_idx, + void *retaddr) +{ + target_phys_addr_t ioaddr; + uintptr_t addend; + target_ulong tlb_addr; + int index, i; + + index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + redo: + tlb_addr = env->tlb_table[mmu_idx][index].addr_write; + if ((addr & TARGET_PAGE_MASK) == (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + if (tlb_addr & ~TARGET_PAGE_MASK) { + /* IO access */ + if ((addr & (DATA_SIZE - 1)) != 0) + goto do_unaligned_access; + ioaddr = env->iotlb[mmu_idx][index]; + glue(io_write, SUFFIX)(ioaddr, val, addr, retaddr); + } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) { + do_unaligned_access: + /* XXX: not efficient, but simple */ + /* Note: relies on the fact that tlb_fill() does not remove the + * previous page from the TLB cache. 
*/ + for(i = DATA_SIZE - 1; i >= 0; i--) { +#ifdef TARGET_WORDS_BIGENDIAN + glue(slow_stb, MMUSUFFIX)(addr + i, val >> (((DATA_SIZE - 1) * 8) - (i * 8)), + mmu_idx, retaddr); +#else + glue(slow_stb, MMUSUFFIX)(addr + i, val >> (i * 8), + mmu_idx, retaddr); +#endif + } + } else { + /* aligned/unaligned access in the same page */ + addend = env->tlb_table[mmu_idx][index].addend; + glue(glue(st, SUFFIX), _raw)((uint8_t *)(uintptr_t)(addr+addend), val); + } + } else { + /* the page is not in the TLB : fill it */ + tlb_fill(addr, 1, mmu_idx, retaddr); + goto redo; + } +} + +#endif /* !defined(SOFTMMU_CODE_ACCESS) */ + +#ifdef VBOX +# undef DATA_TYPE_PROMOTED +#endif +#undef READ_ACCESS_TYPE +#undef SHIFT +#undef DATA_TYPE +#undef SUFFIX +#undef USUFFIX +#undef DATA_SIZE +#undef ADDR_READ diff --git a/src/recompiler/target-i386/Makefile.kup b/src/recompiler/target-i386/Makefile.kup new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/recompiler/target-i386/Makefile.kup diff --git a/src/recompiler/target-i386/TODO b/src/recompiler/target-i386/TODO new file mode 100644 index 00000000..8dfd4633 --- /dev/null +++ b/src/recompiler/target-i386/TODO @@ -0,0 +1,32 @@ +Correctness issues: + +- some eflags manipulation incorrectly reset the bit 0x2. +- SVM: test, cpu save/restore, SMM save/restore. +- x86_64: lcall/ljmp intel/amd differences ? +- better code fetch (different exception handling + CS.limit support) +- user/kernel PUSHL/POPL in helper.c +- add missing cpuid tests +- return UD exception if LOCK prefix incorrectly used +- test ldt limit < 7 ? +- fix some 16 bit sp push/pop overflow (pusha/popa, lcall lret) +- full support of segment limit/rights +- full x87 exception support +- improve x87 bit exactness (use bochs code ?) +- DRx register support +- CR0.AC emulation +- SSE alignment checks +- fix SSE min/max with nans + +Optimizations/Features: + +- add SVM nested paging support +- add VMX support +- add AVX support +- add SSE5 support +- fxsave/fxrstor AMD extensions +- improve monitor/mwait support +- faster EFLAGS update: consider SZAP, C, O can be updated separately + with a bit field in CC_OP and more state variables. +- evaluate x87 stack pointer statically +- find a way to avoid translating several time the same TB if CR0.TS + is set or not. diff --git a/src/recompiler/target-i386/cpu.h b/src/recompiler/target-i386/cpu.h new file mode 100644 index 00000000..c643db8d --- /dev/null +++ b/src/recompiler/target-i386/cpu.h @@ -0,0 +1,1215 @@ +/* + * i386 virtual CPU header + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#ifndef CPU_I386_H +#define CPU_I386_H + +#include "config.h" + +#ifdef TARGET_X86_64 +#define TARGET_LONG_BITS 64 +#else +#define TARGET_LONG_BITS 32 +#endif + +/* target supports implicit self modifying code */ +#define TARGET_HAS_SMC +/* support for self modifying code even if the modified instruction is + close to the modifying instruction */ +#define TARGET_HAS_PRECISE_SMC + +#define TARGET_HAS_ICE 1 + +#ifdef TARGET_X86_64 +#define ELF_MACHINE EM_X86_64 +#else +#define ELF_MACHINE EM_386 +#endif + +#define CPUState struct CPUX86State + +#include "cpu-defs.h" + +#include "softfloat.h" + +#ifdef VBOX +# include <iprt/critsect.h> +# include <iprt/thread.h> +# include <iprt/assert.h> +# include <iprt/asm.h> +# include <VBox/vmm/vmm.h> +# include <VBox/vmm/stam.h> +# include <VBox/vmm/cpumctx.h> +# undef MSR_IA32_APICBASE_BSP +#endif /* VBOX */ + +#define R_EAX 0 +#define R_ECX 1 +#define R_EDX 2 +#define R_EBX 3 +#define R_ESP 4 +#define R_EBP 5 +#define R_ESI 6 +#define R_EDI 7 + +#define R_AL 0 +#define R_CL 1 +#define R_DL 2 +#define R_BL 3 +#define R_AH 4 +#define R_CH 5 +#define R_DH 6 +#define R_BH 7 + +#define R_ES 0 +#define R_CS 1 +#define R_SS 2 +#define R_DS 3 +#define R_FS 4 +#define R_GS 5 + +/* segment descriptor fields */ +#define DESC_G_MASK (1 << 23) +#define DESC_B_SHIFT 22 +#define DESC_B_MASK (1 << DESC_B_SHIFT) +#define DESC_L_SHIFT 21 /* x86_64 only : 64 bit code segment */ +#define DESC_L_MASK (1 << DESC_L_SHIFT) +#define DESC_AVL_MASK (1 << 20) +#define DESC_P_MASK (1 << 15) +#define DESC_DPL_SHIFT 13 +#define DESC_DPL_MASK (3 << DESC_DPL_SHIFT) +#define DESC_S_MASK (1 << 12) +#define DESC_TYPE_SHIFT 8 +#define DESC_TYPE_MASK (15 << DESC_TYPE_SHIFT) +#define DESC_A_MASK (1 << 8) + +#define DESC_CS_MASK (1 << 11) /* 1=code segment 0=data segment */ +#define DESC_C_MASK (1 << 10) /* code: conforming */ +#define DESC_R_MASK (1 << 9) /* code: readable */ + +#define DESC_E_MASK (1 << 10) /* data: expansion direction */ +#define DESC_W_MASK (1 << 9) /* data: writable */ + +#define DESC_TSS_BUSY_MASK (1 << 9) +#ifdef VBOX +# define DESC_INTEL_UNUSABLE RT_BIT_32(16+8) /**< Internal VT-x bit for NULL sectors. */ +# define DESC_RAW_FLAG_BITS UINT32_C(0x00ffffff) /**< Flag bits we load from the descriptor. */ +#endif + +/* eflags masks */ +#define CC_C 0x0001 +#define CC_P 0x0004 +#define CC_A 0x0010 +#define CC_Z 0x0040 +#define CC_S 0x0080 +#define CC_O 0x0800 + +#define TF_SHIFT 8 +#define IOPL_SHIFT 12 +#define VM_SHIFT 17 + +#define TF_MASK 0x00000100 +#define IF_MASK 0x00000200 +#define DF_MASK 0x00000400 +#define IOPL_MASK 0x00003000 +#define NT_MASK 0x00004000 +#define RF_MASK 0x00010000 +#define VM_MASK 0x00020000 +#define AC_MASK 0x00040000 +#define VIF_MASK 0x00080000 +#define VIP_MASK 0x00100000 +#define ID_MASK 0x00200000 + +/* hidden flags - used internally by qemu to represent additional cpu + states. Only the CPL, INHIBIT_IRQ, SMM and SVMI are not + redundant. We avoid using the IOPL_MASK, TF_MASK and VM_MASK bit + position to ease oring with eflags. 
*/ +/* current cpl */ +#define HF_CPL_SHIFT 0 +/* true if soft mmu is being used */ +#define HF_SOFTMMU_SHIFT 2 +/* true if hardware interrupts must be disabled for next instruction */ +#define HF_INHIBIT_IRQ_SHIFT 3 +/* 16 or 32 segments */ +#define HF_CS32_SHIFT 4 +#define HF_SS32_SHIFT 5 +/* zero base for DS, ES and SS : can be '0' only in 32 bit CS segment */ +#define HF_ADDSEG_SHIFT 6 +/* copy of CR0.PE (protected mode) */ +#define HF_PE_SHIFT 7 +#define HF_TF_SHIFT 8 /* must be same as eflags */ +#define HF_MP_SHIFT 9 /* the order must be MP, EM, TS */ +#define HF_EM_SHIFT 10 +#define HF_TS_SHIFT 11 +#define HF_IOPL_SHIFT 12 /* must be same as eflags */ +#define HF_LMA_SHIFT 14 /* only used on x86_64: long mode active */ +#define HF_CS64_SHIFT 15 /* only used on x86_64: 64 bit code segment */ +#define HF_RF_SHIFT 16 /* must be same as eflags */ +#define HF_VM_SHIFT 17 /* must be same as eflags */ +#define HF_SMM_SHIFT 19 /* CPU in SMM mode */ +#define HF_SVME_SHIFT 20 /* SVME enabled (copy of EFER.SVME) */ +#define HF_SVMI_SHIFT 21 /* SVM intercepts are active */ +#define HF_OSFXSR_SHIFT 22 /* CR4.OSFXSR */ + +#define HF_CPL_MASK (3 << HF_CPL_SHIFT) +#define HF_SOFTMMU_MASK (1 << HF_SOFTMMU_SHIFT) +#define HF_INHIBIT_IRQ_MASK (1 << HF_INHIBIT_IRQ_SHIFT) +#define HF_CS32_MASK (1 << HF_CS32_SHIFT) +#define HF_SS32_MASK (1 << HF_SS32_SHIFT) +#define HF_ADDSEG_MASK (1 << HF_ADDSEG_SHIFT) +#define HF_PE_MASK (1 << HF_PE_SHIFT) +#define HF_TF_MASK (1 << HF_TF_SHIFT) +#define HF_MP_MASK (1 << HF_MP_SHIFT) +#define HF_EM_MASK (1 << HF_EM_SHIFT) +#define HF_TS_MASK (1 << HF_TS_SHIFT) +#define HF_IOPL_MASK (3 << HF_IOPL_SHIFT) +#define HF_LMA_MASK (1 << HF_LMA_SHIFT) +#define HF_CS64_MASK (1 << HF_CS64_SHIFT) +#define HF_RF_MASK (1 << HF_RF_SHIFT) +#define HF_VM_MASK (1 << HF_VM_SHIFT) +#define HF_SMM_MASK (1 << HF_SMM_SHIFT) +#define HF_SVME_MASK (1 << HF_SVME_SHIFT) +#define HF_SVMI_MASK (1 << HF_SVMI_SHIFT) +#define HF_OSFXSR_MASK (1 << HF_OSFXSR_SHIFT) + +/* hflags2 */ + +#define HF2_GIF_SHIFT 0 /* if set CPU takes interrupts */ +#define HF2_HIF_SHIFT 1 /* value of IF_MASK when entering SVM */ +#define HF2_NMI_SHIFT 2 /* CPU serving NMI */ +#define HF2_VINTR_SHIFT 3 /* value of V_INTR_MASKING bit */ + +#define HF2_GIF_MASK (1 << HF2_GIF_SHIFT) +#define HF2_HIF_MASK (1 << HF2_HIF_SHIFT) +#define HF2_NMI_MASK (1 << HF2_NMI_SHIFT) +#define HF2_VINTR_MASK (1 << HF2_VINTR_SHIFT) + +#define CR0_PE_SHIFT 0 +#define CR0_MP_SHIFT 1 + +#define CR0_PE_MASK (1 << 0) +#define CR0_MP_MASK (1 << 1) +#define CR0_EM_MASK (1 << 2) +#define CR0_TS_MASK (1 << 3) +#define CR0_ET_MASK (1 << 4) +#define CR0_NE_MASK (1 << 5) +#define CR0_WP_MASK (1 << 16) +#define CR0_AM_MASK (1 << 18) +#define CR0_PG_MASK (1U << 31) + +#define CR4_VME_MASK (1 << 0) +#define CR4_PVI_MASK (1 << 1) +#define CR4_TSD_MASK (1 << 2) +#define CR4_DE_MASK (1 << 3) +#define CR4_PSE_MASK (1 << 4) +#define CR4_PAE_MASK (1 << 5) +#define CR4_MCE_MASK (1 << 6) +#define CR4_PGE_MASK (1 << 7) +#define CR4_PCE_MASK (1 << 8) +#define CR4_OSFXSR_SHIFT 9 +#define CR4_OSFXSR_MASK (1 << CR4_OSFXSR_SHIFT) +#define CR4_OSXMMEXCPT_MASK (1 << 10) + +#define DR6_BD (1 << 13) +#define DR6_BS (1 << 14) +#define DR6_BT (1 << 15) +#define DR6_FIXED_1 0xffff0ff0 + +#define DR7_GD (1 << 13) +#define DR7_TYPE_SHIFT 16 +#define DR7_LEN_SHIFT 18 +#define DR7_FIXED_1 0x00000400 + +#define PG_PRESENT_BIT 0 +#define PG_RW_BIT 1 +#define PG_USER_BIT 2 +#define PG_PWT_BIT 3 +#define PG_PCD_BIT 4 +#define PG_ACCESSED_BIT 5 +#define PG_DIRTY_BIT 6 +#define PG_PSE_BIT 7 
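/*
 * Editorial sketch, not part of the upstream header: the HF_* masks above are
 * or'ed into env->hflags and tested together by the translator.  The CPL always
 * lives in bits 0-1 (HF_CPL_MASK), and 64-bit code requires both long mode
 * (HF_LMA_MASK, a copy of EFER.LMA) and a 64-bit code segment (HF_CS64_MASK),
 * as set up by cpu_x86_load_seg_cache() further down in this header.  The
 * helper names below are hypothetical illustrations only.
 */
static inline int sketch_hflags_cpl(uint32_t hflags)
{
    /* extract the current privilege level from bits 0-1 */
    return (hflags & HF_CPL_MASK) >> HF_CPL_SHIFT;
}

static inline int sketch_hflags_is_code64(uint32_t hflags)
{
    /* true only when both EFER.LMA and CS.L are reflected in the flags */
    return (hflags & (HF_LMA_MASK | HF_CS64_MASK)) == (HF_LMA_MASK | HF_CS64_MASK);
}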
+#define PG_GLOBAL_BIT 8 +#define PG_NX_BIT 63 + +#define PG_PRESENT_MASK (1 << PG_PRESENT_BIT) +#define PG_RW_MASK (1 << PG_RW_BIT) +#define PG_USER_MASK (1 << PG_USER_BIT) +#define PG_PWT_MASK (1 << PG_PWT_BIT) +#define PG_PCD_MASK (1 << PG_PCD_BIT) +#define PG_ACCESSED_MASK (1 << PG_ACCESSED_BIT) +#define PG_DIRTY_MASK (1 << PG_DIRTY_BIT) +#define PG_PSE_MASK (1 << PG_PSE_BIT) +#define PG_GLOBAL_MASK (1 << PG_GLOBAL_BIT) +#define PG_NX_MASK (1LL << PG_NX_BIT) + +#define PG_ERROR_W_BIT 1 + +#define PG_ERROR_P_MASK 0x01 +#define PG_ERROR_W_MASK (1 << PG_ERROR_W_BIT) +#define PG_ERROR_U_MASK 0x04 +#define PG_ERROR_RSVD_MASK 0x08 +#define PG_ERROR_I_D_MASK 0x10 + +#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ + +#define MCE_CAP_DEF MCG_CTL_P +#define MCE_BANKS_DEF 10 + +#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ + +#define MCI_STATUS_VAL (1ULL<<63) /* valid error */ +#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ +#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ + +#define MSR_IA32_TSC 0x10 +#define MSR_IA32_APICBASE 0x1b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) + +#define MSR_MTRRcap 0xfe +#define MSR_MTRRcap_VCNT 8 +#define MSR_MTRRcap_FIXRANGE_SUPPORT (1 << 8) +#define MSR_MTRRcap_WC_SUPPORTED (1 << 10) + +#define MSR_IA32_SYSENTER_CS 0x174 +#define MSR_IA32_SYSENTER_ESP 0x175 +#define MSR_IA32_SYSENTER_EIP 0x176 + +#define MSR_MCG_CAP 0x179 +#define MSR_MCG_STATUS 0x17a +#define MSR_MCG_CTL 0x17b + +#define MSR_IA32_PERF_STATUS 0x198 + +#define MSR_MTRRphysBase(reg) (0x200 + 2 * (reg)) +#define MSR_MTRRphysMask(reg) (0x200 + 2 * (reg) + 1) + +#define MSR_MTRRfix64K_00000 0x250 +#define MSR_MTRRfix16K_80000 0x258 +#define MSR_MTRRfix16K_A0000 0x259 +#define MSR_MTRRfix4K_C0000 0x268 +#define MSR_MTRRfix4K_C8000 0x269 +#define MSR_MTRRfix4K_D0000 0x26a +#define MSR_MTRRfix4K_D8000 0x26b +#define MSR_MTRRfix4K_E0000 0x26c +#define MSR_MTRRfix4K_E8000 0x26d +#define MSR_MTRRfix4K_F0000 0x26e +#define MSR_MTRRfix4K_F8000 0x26f + +#define MSR_PAT 0x277 + +#define MSR_MTRRdefType 0x2ff + +#define MSR_MC0_CTL 0x400 +#define MSR_MC0_STATUS 0x401 +#define MSR_MC0_ADDR 0x402 +#define MSR_MC0_MISC 0x403 + +#define MSR_EFER 0xc0000080 + +#define MSR_EFER_SCE (1 << 0) +#define MSR_EFER_LME (1 << 8) +#define MSR_EFER_LMA (1 << 10) +#define MSR_EFER_NXE (1 << 11) +#define MSR_EFER_SVME (1 << 12) +#define MSR_EFER_FFXSR (1 << 14) + +#ifdef VBOX +# define MSR_APIC_RANGE_START 0x800 +# define MSR_APIC_RANGE_END 0x900 +#endif + +#define MSR_STAR 0xc0000081 +#define MSR_LSTAR 0xc0000082 +#define MSR_CSTAR 0xc0000083 +#define MSR_FMASK 0xc0000084 +#define MSR_FSBASE 0xc0000100 +#define MSR_GSBASE 0xc0000101 +#define MSR_KERNELGSBASE 0xc0000102 +#define MSR_TSC_AUX 0xc0000103 + +#define MSR_VM_HSAVE_PA 0xc0010117 + +/* cpuid_features bits */ +#define CPUID_FP87 (1 << 0) +#define CPUID_VME (1 << 1) +#define CPUID_DE (1 << 2) +#define CPUID_PSE (1 << 3) +#define CPUID_TSC (1 << 4) +#define CPUID_MSR (1 << 5) +#define CPUID_PAE (1 << 6) +#define CPUID_MCE (1 << 7) +#define CPUID_CX8 (1 << 8) +#define CPUID_APIC (1 << 9) +#define CPUID_SEP (1 << 11) /* sysenter/sysexit */ +#define CPUID_MTRR (1 << 12) +#define CPUID_PGE (1 << 13) +#define CPUID_MCA (1 << 14) +#define CPUID_CMOV (1 << 15) +#define CPUID_PAT (1 << 16) +#define CPUID_PSE36 (1 << 17) +#define CPUID_PN (1 << 18) +#define CPUID_CLFLUSH (1 << 19) +#define CPUID_DTS (1 << 21) +#define CPUID_ACPI (1 << 22) 
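/*
 * Editorial sketch, not part of the upstream header: the MSR_EFER_* bits above
 * interact with paging.  cpu_x86_update_cr0() in helper.c of this import sets
 * EFER.LMA when paging is enabled while EFER.LME is set, and clears it again
 * when paging is switched off.  The helper name below is a hypothetical
 * illustration of that rule only.
 */
static inline uint64_t sketch_efer_on_paging_toggle(uint64_t efer, int paging_enabled)
{
    if (paging_enabled && (efer & MSR_EFER_LME))
        efer |= MSR_EFER_LMA;               /* entering long mode */
    else if (!paging_enabled)
        efer &= ~(uint64_t)MSR_EFER_LMA;    /* leaving long mode */
    return efer;
}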
+#define CPUID_MMX (1 << 23) +#define CPUID_FXSR (1 << 24) +#define CPUID_SSE (1 << 25) +#define CPUID_SSE2 (1 << 26) +#define CPUID_SS (1 << 27) +#define CPUID_HT (1 << 28) +#define CPUID_TM (1 << 29) +#define CPUID_IA64 (1 << 30) +#define CPUID_PBE (1 << 31) + +#define CPUID_EXT_SSE3 (1 << 0) +#define CPUID_EXT_DTES64 (1 << 2) +#define CPUID_EXT_MONITOR (1 << 3) +#define CPUID_EXT_DSCPL (1 << 4) +#define CPUID_EXT_VMX (1 << 5) +#define CPUID_EXT_SMX (1 << 6) +#define CPUID_EXT_EST (1 << 7) +#define CPUID_EXT_TM2 (1 << 8) +#define CPUID_EXT_SSSE3 (1 << 9) +#define CPUID_EXT_CID (1 << 10) +#define CPUID_EXT_CX16 (1 << 13) +#define CPUID_EXT_XTPR (1 << 14) +#define CPUID_EXT_PDCM (1 << 15) +#define CPUID_EXT_DCA (1 << 18) +#define CPUID_EXT_SSE41 (1 << 19) +#define CPUID_EXT_SSE42 (1 << 20) +#define CPUID_EXT_X2APIC (1 << 21) +#define CPUID_EXT_MOVBE (1 << 22) +#define CPUID_EXT_POPCNT (1 << 23) +#define CPUID_EXT_XSAVE (1 << 26) +#define CPUID_EXT_OSXSAVE (1 << 27) +#define CPUID_EXT_HYPERVISOR (1 << 31) + +#define CPUID_EXT2_SYSCALL (1 << 11) +#define CPUID_EXT2_MP (1 << 19) +#define CPUID_EXT2_NX (1 << 20) +#define CPUID_EXT2_MMXEXT (1 << 22) +#define CPUID_EXT2_FFXSR (1 << 25) +#define CPUID_EXT2_PDPE1GB (1 << 26) +#define CPUID_EXT2_RDTSCP (1 << 27) +#define CPUID_EXT2_LM (1 << 29) +#define CPUID_EXT2_3DNOWEXT (1 << 30) +#define CPUID_EXT2_3DNOW (1 << 31) + +#define CPUID_EXT3_LAHF_LM (1 << 0) +#define CPUID_EXT3_CMP_LEG (1 << 1) +#define CPUID_EXT3_SVM (1 << 2) +#define CPUID_EXT3_EXTAPIC (1 << 3) +#define CPUID_EXT3_CR8LEG (1 << 4) +#define CPUID_EXT3_ABM (1 << 5) +#define CPUID_EXT3_SSE4A (1 << 6) +#define CPUID_EXT3_MISALIGNSSE (1 << 7) +#define CPUID_EXT3_3DNOWPREFETCH (1 << 8) +#define CPUID_EXT3_OSVW (1 << 9) +#define CPUID_EXT3_IBS (1 << 10) +#define CPUID_EXT3_SKINIT (1 << 12) + +#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */ +#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */ +#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */ + +#define CPUID_VENDOR_AMD_1 0x68747541 /* "Auth" */ +#define CPUID_VENDOR_AMD_2 0x69746e65 /* "enti" */ +#define CPUID_VENDOR_AMD_3 0x444d4163 /* "cAMD" */ + +#define CPUID_MWAIT_IBE (1 << 1) /* Interrupts can exit capability */ +#define CPUID_MWAIT_EMX (1 << 0) /* enumeration supported */ + +#define EXCP00_DIVZ 0 +#define EXCP01_DB 1 +#define EXCP02_NMI 2 +#define EXCP03_INT3 3 +#define EXCP04_INTO 4 +#define EXCP05_BOUND 5 +#define EXCP06_ILLOP 6 +#define EXCP07_PREX 7 +#define EXCP08_DBLE 8 +#define EXCP09_XERR 9 +#define EXCP0A_TSS 10 +#define EXCP0B_NOSEG 11 +#define EXCP0C_STACK 12 +#define EXCP0D_GPF 13 +#define EXCP0E_PAGE 14 +#define EXCP10_COPR 16 +#define EXCP11_ALGN 17 +#define EXCP12_MCHK 18 + +#define EXCP_SYSCALL 0x100 /* only happens in user only emulation + for syscall instruction */ + +enum { + CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */ + CC_OP_EFLAGS, /* all cc are explicitly computed, CC_SRC = flags */ + + CC_OP_MULB, /* modify all flags, C, O = (CC_SRC != 0) */ + CC_OP_MULW, + CC_OP_MULL, + CC_OP_MULQ, + + CC_OP_ADDB, /* modify all flags, CC_DST = res, CC_SRC = src1 */ + CC_OP_ADDW, + CC_OP_ADDL, + CC_OP_ADDQ, + + CC_OP_ADCB, /* modify all flags, CC_DST = res, CC_SRC = src1 */ + CC_OP_ADCW, + CC_OP_ADCL, + CC_OP_ADCQ, + + CC_OP_SUBB, /* modify all flags, CC_DST = res, CC_SRC = src1 */ + CC_OP_SUBW, + CC_OP_SUBL, + CC_OP_SUBQ, + + CC_OP_SBBB, /* modify all flags, CC_DST = res, CC_SRC = src1 */ + CC_OP_SBBW, + CC_OP_SBBL, + CC_OP_SBBQ, + + CC_OP_LOGICB, /* modify all flags, CC_DST = res */ + CC_OP_LOGICW, + 
CC_OP_LOGICL, + CC_OP_LOGICQ, + + CC_OP_INCB, /* modify all flags except, CC_DST = res, CC_SRC = C */ + CC_OP_INCW, + CC_OP_INCL, + CC_OP_INCQ, + + CC_OP_DECB, /* modify all flags except, CC_DST = res, CC_SRC = C */ + CC_OP_DECW, + CC_OP_DECL, + CC_OP_DECQ, + + CC_OP_SHLB, /* modify all flags, CC_DST = res, CC_SRC.msb = C */ + CC_OP_SHLW, + CC_OP_SHLL, + CC_OP_SHLQ, + + CC_OP_SARB, /* modify all flags, CC_DST = res, CC_SRC.lsb = C */ + CC_OP_SARW, + CC_OP_SARL, + CC_OP_SARQ, + + CC_OP_NB, +}; + +#ifdef FLOATX80 +#define USE_X86LDOUBLE +#endif + +#ifdef USE_X86LDOUBLE +typedef floatx80 CPU86_LDouble; +#else +typedef float64 CPU86_LDouble; +#endif + +typedef struct SegmentCache { + uint32_t selector; +#ifdef VBOX + /** The new selector is saved here when we are unable to sync it before invoking the recompiled code. */ + uint16_t newselector; + uint16_t fVBoxFlags; +#endif + target_ulong base; + uint32_t limit; + uint32_t flags; +} SegmentCache; + +typedef union { + uint8_t _b[16]; + uint16_t _w[8]; + uint32_t _l[4]; + uint64_t _q[2]; + float32 _s[4]; + float64 _d[2]; +} XMMReg; + +typedef union { + uint8_t _b[8]; + uint16_t _w[4]; + uint32_t _l[2]; + float32 _s[2]; + uint64_t q; +} MMXReg; + +#ifdef HOST_WORDS_BIGENDIAN +#define XMM_B(n) _b[15 - (n)] +#define XMM_W(n) _w[7 - (n)] +#define XMM_L(n) _l[3 - (n)] +#define XMM_S(n) _s[3 - (n)] +#define XMM_Q(n) _q[1 - (n)] +#define XMM_D(n) _d[1 - (n)] + +#define MMX_B(n) _b[7 - (n)] +#define MMX_W(n) _w[3 - (n)] +#define MMX_L(n) _l[1 - (n)] +#define MMX_S(n) _s[1 - (n)] +#else +#define XMM_B(n) _b[n] +#define XMM_W(n) _w[n] +#define XMM_L(n) _l[n] +#define XMM_S(n) _s[n] +#define XMM_Q(n) _q[n] +#define XMM_D(n) _d[n] + +#define MMX_B(n) _b[n] +#define MMX_W(n) _w[n] +#define MMX_L(n) _l[n] +#define MMX_S(n) _s[n] +#endif +#define MMX_Q(n) q + +typedef union { +#ifdef USE_X86LDOUBLE + CPU86_LDouble d __attribute__((aligned(16))); +#else + CPU86_LDouble d; +#endif + MMXReg mmx; +} FPReg; + +typedef struct { + uint64_t base; + uint64_t mask; +} MTRRVar; + +#define CPU_NB_REGS64 16 +#define CPU_NB_REGS32 8 + +#ifdef TARGET_X86_64 +#define CPU_NB_REGS CPU_NB_REGS64 +#else +#define CPU_NB_REGS CPU_NB_REGS32 +#endif + +#define NB_MMU_MODES 2 + +typedef struct CPUX86State { + /* standard registers */ + target_ulong regs[CPU_NB_REGS]; + target_ulong eip; + target_ulong eflags; /* eflags register. During CPU emulation, CC + flags and DF are set to zero because they are + stored elsewhere */ + + /* emulator internal eflags handling */ + target_ulong cc_src; + target_ulong cc_dst; + uint32_t cc_op; + int32_t df; /* D flag : 1 if D = 0, -1 if D = 1 */ + uint32_t hflags; /* TB flags, see HF_xxx constants. These flags + are known at translation time. */ + uint32_t hflags2; /* various other flags, see HF2_xxx constants. */ + + /* segments */ + SegmentCache segs[6]; /* selector values */ + SegmentCache ldt; + SegmentCache tr; + SegmentCache gdt; /* only base and limit are used */ + SegmentCache idt; /* only base and limit are used */ + + target_ulong cr[5]; /* NOTE: cr1 is unused */ + int32_t a20_mask; + + /* FPU state */ + unsigned int fpstt; /* top of stack index */ + uint16_t fpus; + uint16_t fpuc; + uint8_t fptags[8]; /* 0 = valid, 1 = empty */ + FPReg fpregs[8]; + + /* emulator internal variables */ + float_status fp_status; +#ifdef VBOX + uint32_t alignment3[3]; /* force the long double to start a 16 byte line. 
*/ +#endif + CPU86_LDouble ft0; +#if defined(VBOX) && defined(RT_ARCH_X86) && !defined(RT_OS_DARWIN) + uint32_t alignment4; /* long double is 12 byte, pad it to 16. */ +#endif + + float_status mmx_status; /* for 3DNow! float ops */ + float_status sse_status; + uint32_t mxcsr; + XMMReg xmm_regs[CPU_NB_REGS]; + XMMReg xmm_t0; + MMXReg mmx_t0; + target_ulong cc_tmp; /* temporary for rcr/rcl */ + + /* sysenter registers */ + uint32_t sysenter_cs; +#ifdef VBOX + uint32_t alignment0; +#endif + target_ulong sysenter_esp; + target_ulong sysenter_eip; + uint64_t efer; + uint64_t star; + + uint64_t vm_hsave; + uint64_t vm_vmcb; + uint64_t tsc_offset; + uint64_t intercept; + uint16_t intercept_cr_read; + uint16_t intercept_cr_write; + uint16_t intercept_dr_read; + uint16_t intercept_dr_write; + uint32_t intercept_exceptions; + uint8_t v_tpr; + +#ifdef TARGET_X86_64 + target_ulong lstar; + target_ulong cstar; + target_ulong fmask; + target_ulong kernelgsbase; +#endif + uint64_t system_time_msr; + uint64_t wall_clock_msr; + + uint64_t tsc; + + uint64_t pat; + + /* exception/interrupt handling */ + int error_code; + int exception_is_int; +#ifdef VBOX +# define EXCEPTION_IS_INT_VALUE_HARDWARE_IRQ 0x42 /**< Special CPUX86State::exception_is_int value indicating hardware irq. (HACK ALERT) */ +#endif + target_ulong exception_next_eip; + target_ulong dr[8]; /* debug registers */ + union { + CPUBreakpoint *cpu_breakpoint[4]; + CPUWatchpoint *cpu_watchpoint[4]; + }; /* break/watchpoints for dr[0..3] */ + uint32_t smbase; + int old_exception; /* exception in flight */ + + CPU_COMMON + +#ifdef VBOX + /** cpu state flags. (see defines below) */ + uint32_t state; + /** The VM handle. */ + PVM pVM; + /** The VMCPU handle. */ + PVMCPU pVCpu; + /** code buffer for instruction emulation */ + void *pvCodeBuffer; + /** code buffer size */ + uint32_t cbCodeBuffer; +#endif /* VBOX */ + + /* processor features (e.g. for CPUID insn) */ +#ifndef VBOX /* remR3CpuId deals with these */ + uint32_t cpuid_level; + uint32_t cpuid_vendor1; + uint32_t cpuid_vendor2; + uint32_t cpuid_vendor3; + uint32_t cpuid_version; +#endif /* !VBOX */ + uint32_t cpuid_features; + uint32_t cpuid_ext_features; +#ifndef VBOX + uint32_t cpuid_xlevel; + uint32_t cpuid_model[12]; +#endif /* !VBOX */ + uint32_t cpuid_ext2_features; + uint32_t cpuid_ext3_features; + uint32_t cpuid_apic_id; +#ifndef VBOX + int cpuid_vendor_override; + + /* MTRRs */ + uint64_t mtrr_fixed[11]; + uint64_t mtrr_deftype; + MTRRVar mtrr_var[8]; + + /* For KVM */ + uint32_t mp_state; + int32_t exception_injected; + int32_t interrupt_injected; + uint8_t soft_interrupt; + uint8_t nmi_injected; + uint8_t nmi_pending; + uint8_t has_error_code; + uint32_t sipi_vector; + + uint32_t cpuid_kvm_features; + + /* in order to simplify APIC support, we leave this pointer to the + user */ + struct DeviceState *apic_state; + + uint64 mcg_cap; + uint64 mcg_status; + uint64 mcg_ctl; + uint64 mce_banks[MCE_BANKS_DEF*4]; + + uint64_t tsc_aux; + + /* vmstate */ + uint16_t fpus_vmstate; + uint16_t fptag_vmstate; + uint16_t fpregs_format_vmstate; + + uint64_t xstate_bv; + XMMReg ymmh_regs[CPU_NB_REGS]; + + uint64_t xcr0; +#else /* VBOX */ + + /** Alignment padding. 
*/ +# if HC_ARCH_BITS == 64 \ + || ( HC_ARCH_BITS == 32 \ + && !defined(RT_OS_WINDOWS) \ + && ( (!defined(VBOX_ENABLE_VBOXREM64) && !defined(RT_OS_SOLARIS) && !defined(RT_OS_FREEBSD)) \ + || (defined(VBOX_ENABLE_VBOXREM64) && (defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD))) ) ) + uint32_t alignment2[1]; +# endif + + /** Profiling tb_flush. */ + STAMPROFILE StatTbFlush; + + /** Addends for HVA -> GPA translations. */ + target_phys_addr_t phys_addends[NB_MMU_MODES][CPU_TLB_SIZE]; +#endif /* VBOX */ +} CPUX86State; + +#ifdef VBOX + +/* Version 1.6 structure; just for loading the old saved state */ +typedef struct SegmentCache_Ver16 { + uint32_t selector; + uint32_t base; + uint32_t limit; + uint32_t flags; + /** The new selector is saved here when we are unable to sync it before invoking the recompiled code. */ + uint32_t newselector; +} SegmentCache_Ver16; + +# define CPU_NB_REGS_VER16 8 + +/* Version 1.6 structure; just for loading the old saved state */ +typedef struct CPUX86State_Ver16 { +# if TARGET_LONG_BITS > HOST_LONG_BITS + /* temporaries if we cannot store them in host registers */ + uint32_t t0, t1, t2; +# endif + + /* standard registers */ + uint32_t regs[CPU_NB_REGS_VER16]; + uint32_t eip; + uint32_t eflags; /* eflags register. During CPU emulation, CC + flags and DF are set to zero because they are + stored elsewhere */ + + /* emulator internal eflags handling */ + uint32_t cc_src; + uint32_t cc_dst; + uint32_t cc_op; + int32_t df; /* D flag : 1 if D = 0, -1 if D = 1 */ + uint32_t hflags; /* hidden flags, see HF_xxx constants */ + + /* segments */ + SegmentCache_Ver16 segs[6]; /* selector values */ + SegmentCache_Ver16 ldt; + SegmentCache_Ver16 tr; + SegmentCache_Ver16 gdt; /* only base and limit are used */ + SegmentCache_Ver16 idt; /* only base and limit are used */ + + uint32_t cr[5]; /* NOTE: cr1 is unused */ + uint32_t a20_mask; + + /* FPU state */ + unsigned int fpstt; /* top of stack index */ + unsigned int fpus; + unsigned int fpuc; + uint8_t fptags[8]; /* 0 = valid, 1 = empty */ + union { +# ifdef USE_X86LDOUBLE + CPU86_LDouble d __attribute__((aligned(16))); +# else + CPU86_LDouble d; +# endif + MMXReg mmx; + } fpregs[8]; + + /* emulator internal variables */ + float_status fp_status; +# ifdef VBOX + uint32_t alignment3[3]; /* force the long double to start a 16 byte line. */ +# endif + CPU86_LDouble ft0; +# if defined(VBOX) && defined(RT_ARCH_X86) && !defined(RT_OS_DARWIN) + uint32_t alignment4; /* long double is 12 byte, pad it to 16. */ +# endif + union { + float f; + double d; + int i32; + int64_t i64; + } fp_convert; + + float_status sse_status; + uint32_t mxcsr; + XMMReg xmm_regs[CPU_NB_REGS_VER16]; + XMMReg xmm_t0; + MMXReg mmx_t0; + + /* sysenter registers */ + uint32_t sysenter_cs; + uint32_t sysenter_esp; + uint32_t sysenter_eip; +# ifdef VBOX + uint32_t alignment0; +# endif + uint64_t efer; + uint64_t star; + + uint64_t pat; + + /* temporary data for USE_CODE_COPY mode */ +# ifdef USE_CODE_COPY + uint32_t tmp0; + uint32_t saved_esp; + int native_fp_regs; /* if true, the FPU state is in the native CPU regs */ +# endif + + /* exception/interrupt handling */ + jmp_buf jmp_env; +} CPUX86State_Ver16; + +/** CPUX86State state flags + * @{ */ +# define CPU_RAW_RING0 0x0002 /* Set after first time RawR0 is executed, never cleared. 
*/ +# define CPU_EMULATE_SINGLE_INSTR 0x0040 /* Execute a single instruction in emulation mode */ +# define CPU_EMULATE_SINGLE_STEP 0x0080 /* go into single step mode */ +# define CPU_RAW_HM 0x0100 /* Set after first time HWACC is executed, never cleared. */ +/** @} */ +#endif /* !VBOX */ + +#ifdef VBOX +CPUX86State *cpu_x86_init(CPUX86State *env, const char *cpu_model); +#else /* !VBOX */ +CPUX86State *cpu_x86_init(const char *cpu_model); +#endif /* !VBOX */ +int cpu_x86_exec(CPUX86State *s); +void cpu_x86_close(CPUX86State *s); +void x86_cpu_list (FILE *f, int (*cpu_fprintf)(FILE *f, const char *fmt, ...), + const char *optarg); +void x86_cpudef_setup(void); + +int cpu_get_pic_interrupt(CPUX86State *s); +/* MSDOS compatibility mode FPU exception support */ +void cpu_set_ferr(CPUX86State *s); + +/* this function must always be used to load data in the segment + cache: it synchronizes the hflags with the segment cache values */ +#ifndef VBOX +static inline void cpu_x86_load_seg_cache(CPUX86State *env, + int seg_reg, unsigned int selector, + target_ulong base, + unsigned int limit, + unsigned int flags) +#else +static inline void cpu_x86_load_seg_cache_with_clean_flags(CPUX86State *env, + int seg_reg, unsigned int selector, + target_ulong base, + unsigned int limit, + unsigned int flags) +#endif +{ + SegmentCache *sc; + unsigned int new_hflags; + + sc = &env->segs[seg_reg]; + sc->selector = selector; + sc->base = base; + sc->limit = limit; + sc->flags = flags; +#ifdef VBOX + sc->newselector = 0; + sc->fVBoxFlags = CPUMSELREG_FLAGS_VALID; +#endif + + /* update the hidden flags */ + { + if (seg_reg == R_CS) { +#ifdef TARGET_X86_64 + if ((env->hflags & HF_LMA_MASK) && (flags & DESC_L_MASK)) { + /* long mode */ + env->hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK; + env->hflags &= ~(HF_ADDSEG_MASK); + } else +#endif + { + /* legacy / compatibility case */ + new_hflags = (env->segs[R_CS].flags & DESC_B_MASK) + >> (DESC_B_SHIFT - HF_CS32_SHIFT); + env->hflags = (env->hflags & ~(HF_CS32_MASK | HF_CS64_MASK)) | + new_hflags; + } + } + new_hflags = (env->segs[R_SS].flags & DESC_B_MASK) + >> (DESC_B_SHIFT - HF_SS32_SHIFT); + if (env->hflags & HF_CS64_MASK) { + /* zero base assumed for DS, ES and SS in long mode */ + } else if (!(env->cr[0] & CR0_PE_MASK) || + (env->eflags & VM_MASK) || + !(env->hflags & HF_CS32_MASK)) { + /* XXX: try to avoid this test. The problem comes from the + fact that is real mode or vm86 mode we only modify the + 'base' and 'selector' fields of the segment cache to go + faster. A solution may be to force addseg to one in + translate-i386.c. */ + new_hflags |= HF_ADDSEG_MASK; + } else { + new_hflags |= ((env->segs[R_DS].base | + env->segs[R_ES].base | + env->segs[R_SS].base) != 0) << + HF_ADDSEG_SHIFT; + } + env->hflags = (env->hflags & + ~(HF_SS32_MASK | HF_ADDSEG_MASK)) | new_hflags; + } +} + +#ifdef VBOX +/* Raw input, adjust the flags adding the stupid intel flag when applicable. */ +static inline void cpu_x86_load_seg_cache(CPUX86State *env, + int seg_reg, unsigned int selector, + target_ulong base, + unsigned int limit, + unsigned int flags) +{ + flags &= DESC_RAW_FLAG_BITS; + if (flags & DESC_P_MASK) + flags |= DESC_A_MASK; /* Make sure the A bit is set to avoid trouble. 
*/ + else if (selector < 4U) + flags |= DESC_INTEL_UNUSABLE; + cpu_x86_load_seg_cache_with_clean_flags(env, seg_reg, selector, base, limit, flags); +} +#endif + +static inline void cpu_x86_load_seg_cache_sipi(CPUX86State *env, + int sipi_vector) +{ + env->eip = 0; + cpu_x86_load_seg_cache(env, R_CS, sipi_vector << 8, + sipi_vector << 12, + env->segs[R_CS].limit, + env->segs[R_CS].flags); + env->halted = 0; +} + +int cpu_x86_get_descr_debug(CPUX86State *env, unsigned int selector, + target_ulong *base, unsigned int *limit, + unsigned int *flags); + +/* wrapper, just in case memory mappings must be changed */ +static inline void cpu_x86_set_cpl(CPUX86State *s, int cpl) +{ +#if HF_CPL_MASK == 3 + s->hflags = (s->hflags & ~HF_CPL_MASK) | cpl; +#else +#error HF_CPL_MASK is hardcoded +#endif +} + +/* op_helper.c */ +/* used for debug or cpu save/restore */ +void cpu_get_fp80(uint64_t *pmant, uint16_t *pexp, CPU86_LDouble f); +CPU86_LDouble cpu_set_fp80(uint64_t mant, uint16_t upper); + +/* cpu-exec.c */ +/* the following helpers are only usable in user mode simulation as + they can trigger unexpected exceptions */ +void cpu_x86_load_seg(CPUX86State *s, int seg_reg, int selector); +void cpu_x86_fsave(CPUX86State *s, target_ulong ptr, int data32); +void cpu_x86_frstor(CPUX86State *s, target_ulong ptr, int data32); + +/* you can call this signal handler from your SIGBUS and SIGSEGV + signal handlers to inform the virtual CPU of exceptions. non zero + is returned if the signal was handled by the virtual CPU. */ +int cpu_x86_signal_handler(int host_signum, void *pinfo, + void *puc); + +/* cpuid.c */ +void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); +int cpu_x86_register (CPUX86State *env, const char *cpu_model); +void cpu_clear_apic_feature(CPUX86State *env); + +/* helper.c */ +int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr, + int is_write, int mmu_idx, int is_softmmu); +#define cpu_handle_mmu_fault cpu_x86_handle_mmu_fault +void cpu_x86_set_a20(CPUX86State *env, int a20_state); + +static inline int hw_breakpoint_enabled(unsigned long dr7, int index) +{ + return (dr7 >> (index * 2)) & 3; +} + +static inline int hw_breakpoint_type(unsigned long dr7, int index) +{ + return (dr7 >> (DR7_TYPE_SHIFT + (index * 4))) & 3; +} + +static inline int hw_breakpoint_len(unsigned long dr7, int index) +{ + int len = ((dr7 >> (DR7_LEN_SHIFT + (index * 4))) & 3); + return (len == 2) ? 
8 : len + 1; +} + +void hw_breakpoint_insert(CPUX86State *env, int index); +void hw_breakpoint_remove(CPUX86State *env, int index); +int check_hw_breakpoints(CPUX86State *env, int force_dr6_update); + +/* will be suppressed */ +void cpu_x86_update_cr0(CPUX86State *env, uint32_t new_cr0); +void cpu_x86_update_cr3(CPUX86State *env, target_ulong new_cr3); +void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4); + +/* hw/pc.c */ +void cpu_smm_update(CPUX86State *env); +uint64_t cpu_get_tsc(CPUX86State *env); + +/* used to debug */ +#define X86_DUMP_FPU 0x0001 /* dump FPU state too */ +#define X86_DUMP_CCOP 0x0002 /* dump qemu flag cache */ + +#ifdef VBOX +int cpu_rdmsr(CPUX86State *env, uint32_t idMsr, uint64_t *puValue); +int cpu_wrmsr(CPUX86State *env, uint32_t idMsr, uint64_t uValue); +void cpu_trap_raw(CPUX86State *env1); + +/* in helper.c */ +uint8_t read_byte(CPUX86State *env1, target_ulong addr); +uint16_t read_word(CPUX86State *env1, target_ulong addr); +void write_byte(CPUX86State *env1, target_ulong addr, uint8_t val); +uint32_t read_dword(CPUX86State *env1, target_ulong addr); +void write_word(CPUX86State *env1, target_ulong addr, uint16_t val); +void write_dword(CPUX86State *env1, target_ulong addr, uint32_t val); +/* in helper.c */ +int emulate_single_instr(CPUX86State *env1); +int get_ss_esp_from_tss_raw(CPUX86State *env1, uint32_t *ss_ptr, uint32_t *esp_ptr, int dpl); + +void restore_raw_fp_state(CPUX86State *env, uint8_t *ptr); +void save_raw_fp_state(CPUX86State *env, uint8_t *ptr); +#endif /* VBOX */ + +#define TARGET_PAGE_BITS 12 + +#ifdef TARGET_X86_64 +#define TARGET_PHYS_ADDR_SPACE_BITS 52 +/* ??? This is really 48 bits, sign-extended, but the only thing + accessible to userland with bit 48 set is the VSYSCALL, and that + is handled via other mechanisms. */ +#define TARGET_VIRT_ADDR_SPACE_BITS 47 +#else +#define TARGET_PHYS_ADDR_SPACE_BITS 36 +#define TARGET_VIRT_ADDR_SPACE_BITS 32 +#endif + +#define cpu_init cpu_x86_init +#define cpu_exec cpu_x86_exec +#define cpu_gen_code cpu_x86_gen_code +#define cpu_signal_handler cpu_x86_signal_handler +#define cpu_list_id x86_cpu_list +#define cpudef_setup x86_cpudef_setup + +#define CPU_SAVE_VERSION 12 + +/* MMU modes definitions */ +#define MMU_MODE0_SUFFIX _kernel +#define MMU_MODE1_SUFFIX _user +#define MMU_USER_IDX 1 +static inline int cpu_mmu_index (CPUState *env) +{ + return (env->hflags & HF_CPL_MASK) == 3 ? 
1 : 0; +} + +/* translate.c */ +void optimize_flags_init(void); + +typedef struct CCTable { + int (*compute_all)(void); /* return all the flags */ + int (*compute_c)(void); /* return the C flag */ +} CCTable; + +#if defined(CONFIG_USER_ONLY) +static inline void cpu_clone_regs(CPUState *env, target_ulong newsp) +{ + if (newsp) + env->regs[R_ESP] = newsp; + env->regs[R_EAX] = 0; +} +#endif + +#include "cpu-all.h" +#include "svm.h" + +#ifndef VBOX +#if !defined(CONFIG_USER_ONLY) +#include "hw/apic.h" +#endif +#else /* VBOX */ +extern void cpu_set_apic_tpr(CPUX86State *env, uint8_t val); +extern uint8_t cpu_get_apic_tpr(CPUX86State *env); +extern uint64_t cpu_get_apic_base(CPUX86State *env); +#endif /* VBOX */ + +static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc, + target_ulong *cs_base, int *flags) +{ + *cs_base = env->segs[R_CS].base; + if (env->hflags & HF_CS64_MASK) + *pc = *cs_base + env->eip; + else + *pc = (uint32_t)(*cs_base + env->eip); + *flags = env->hflags | + (env->eflags & (IOPL_MASK | TF_MASK | RF_MASK | VM_MASK)); +} + +#ifndef VBOX +void apic_init_reset(CPUState *env); +void apic_sipi(CPUState *env); +void do_cpu_init(CPUState *env); +void do_cpu_sipi(CPUState *env); +#endif /* !VBOX */ +#endif /* CPU_I386_H */ diff --git a/src/recompiler/target-i386/exec.h b/src/recompiler/target-i386/exec.h new file mode 100644 index 00000000..355599fa --- /dev/null +++ b/src/recompiler/target-i386/exec.h @@ -0,0 +1,370 @@ +/* + * i386 execution defines + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. 
+ */ + +#include "config.h" +#include "dyngen-exec.h" + +/* XXX: factorize this mess */ +#ifdef TARGET_X86_64 +#define TARGET_LONG_BITS 64 +#else +#define TARGET_LONG_BITS 32 +#endif + +#include "cpu-defs.h" + +register struct CPUX86State *env asm(AREG0); + +#include "qemu-common.h" +#include "qemu-log.h" + +#undef EAX +#define EAX (env->regs[R_EAX]) +#undef ECX +#define ECX (env->regs[R_ECX]) +#undef EDX +#define EDX (env->regs[R_EDX]) +#undef EBX +#define EBX (env->regs[R_EBX]) +#undef ESP +#define ESP (env->regs[R_ESP]) +#undef EBP +#define EBP (env->regs[R_EBP]) +#undef ESI +#define ESI (env->regs[R_ESI]) +#undef EDI +#define EDI (env->regs[R_EDI]) +#undef EIP +#define EIP (env->eip) +#define DF (env->df) + +#define CC_SRC (env->cc_src) +#define CC_DST (env->cc_dst) +#define CC_OP (env->cc_op) + +/* float macros */ +#define FT0 (env->ft0) +#define ST0 (env->fpregs[env->fpstt].d) +#define ST(n) (env->fpregs[(env->fpstt + (n)) & 7].d) +#define ST1 ST(1) + +#include "cpu.h" +#include "exec-all.h" + +/* op_helper.c */ +void do_interrupt(int intno, int is_int, int error_code, + target_ulong next_eip, int is_hw); +void do_interrupt_user(int intno, int is_int, int error_code, + target_ulong next_eip); +void QEMU_NORETURN raise_exception_err(int exception_index, int error_code); +void QEMU_NORETURN raise_exception(int exception_index); +void QEMU_NORETURN raise_exception_env(int exception_index, CPUState *nenv); +void do_smm_enter(void); + +/* n must be a constant to be efficient */ +static inline target_long lshift(target_long x, int n) +{ + if (n >= 0) + return x << n; + else + return x >> (-n); +} + +#include "helper.h" + +static inline void svm_check_intercept(uint32_t type) +{ + helper_svm_check_intercept_param(type, 0); +} + +#if !defined(CONFIG_USER_ONLY) + +#include "softmmu_exec.h" + +#endif /* !defined(CONFIG_USER_ONLY) */ + +#ifdef USE_X86LDOUBLE +/* use long double functions */ +#define floatx_to_int32 floatx80_to_int32 +#define floatx_to_int64 floatx80_to_int64 +#define floatx_to_int32_round_to_zero floatx80_to_int32_round_to_zero +#define floatx_to_int64_round_to_zero floatx80_to_int64_round_to_zero +#define int32_to_floatx int32_to_floatx80 +#define int64_to_floatx int64_to_floatx80 +#define float32_to_floatx float32_to_floatx80 +#define float64_to_floatx float64_to_floatx80 +#define floatx_to_float32 floatx80_to_float32 +#define floatx_to_float64 floatx80_to_float64 +#define floatx_abs floatx80_abs +#define floatx_chs floatx80_chs +#define floatx_round_to_int floatx80_round_to_int +#define floatx_compare floatx80_compare +#define floatx_compare_quiet floatx80_compare_quiet +#else +#define floatx_to_int32 float64_to_int32 +#define floatx_to_int64 float64_to_int64 +#define floatx_to_int32_round_to_zero float64_to_int32_round_to_zero +#define floatx_to_int64_round_to_zero float64_to_int64_round_to_zero +#define int32_to_floatx int32_to_float64 +#define int64_to_floatx int64_to_float64 +#define float32_to_floatx float32_to_float64 +#define float64_to_floatx(x, e) (x) +#define floatx_to_float32 float64_to_float32 +#define floatx_to_float64(x, e) (x) +#define floatx_abs float64_abs +#define floatx_chs float64_chs +#define floatx_round_to_int float64_round_to_int +#define floatx_compare float64_compare +#define floatx_compare_quiet float64_compare_quiet +#endif + +#ifdef VBOX +# ifdef IPRT_NO_CRT +# undef sin +# undef cos +# undef sqrt +# undef pow +# undef log +# undef tan +# undef atan2 +# undef floor +# undef ceil +# undef ldexp +# define sin sinl +# define cos cosl +# define sqrt 
sqrtl +# define pow powl +# define log logl +# define tan tanl +# define atan2 atan2l +# define floor floorl +# define ceil ceill +# define ldexp ldexpl +# endif +#endif + +#define RC_MASK 0xc00 +#define RC_NEAR 0x000 +#define RC_DOWN 0x400 +#define RC_UP 0x800 +#define RC_CHOP 0xc00 + +#define MAXTAN 9223372036854775808.0 + +#ifdef USE_X86LDOUBLE + +/* only for x86 */ +typedef union { + long double d; + struct { + unsigned long long lower; + unsigned short upper; + } l; +} CPU86_LDoubleU; + +/* the following deal with x86 long double-precision numbers */ +#define MAXEXPD 0x7fff +#define EXPBIAS 16383 +#define EXPD(fp) (fp.l.upper & 0x7fff) +#define SIGND(fp) ((fp.l.upper) & 0x8000) +#define MANTD(fp) (fp.l.lower) +#define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS + +#else + +/* NOTE: arm is horrible as double 32 bit words are stored in big endian ! */ +typedef union { + double d; +#if !defined(HOST_WORDS_BIGENDIAN) && !defined(__arm__) + struct { + uint32_t lower; + int32_t upper; + } l; +#else + struct { + int32_t upper; + uint32_t lower; + } l; +#endif +#ifndef __arm__ + int64_t ll; +#endif +} CPU86_LDoubleU; + +/* the following deal with IEEE double-precision numbers */ +#define MAXEXPD 0x7ff +#define EXPBIAS 1023 +#define EXPD(fp) (((fp.l.upper) >> 20) & 0x7FF) +#define SIGND(fp) ((fp.l.upper) & 0x80000000) +#ifdef __arm__ +#define MANTD(fp) (fp.l.lower | ((uint64_t)(fp.l.upper & ((1 << 20) - 1)) << 32)) +#else +#define MANTD(fp) (fp.ll & ((1LL << 52) - 1)) +#endif +#define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7ff << 20)) | (EXPBIAS << 20) +#endif + +static inline void fpush(void) +{ + env->fpstt = (env->fpstt - 1) & 7; + env->fptags[env->fpstt] = 0; /* validate stack entry */ +} + +static inline void fpop(void) +{ + env->fptags[env->fpstt] = 1; /* invvalidate stack entry */ + env->fpstt = (env->fpstt + 1) & 7; +} + +#ifndef USE_X86LDOUBLE +static inline CPU86_LDouble helper_fldt(target_ulong ptr) +{ + CPU86_LDoubleU temp; + int upper, e; + uint64_t ll; + + /* mantissa */ + upper = lduw(ptr + 8); + /* XXX: handle overflow ? 
*/ + e = (upper & 0x7fff) - 16383 + EXPBIAS; /* exponent */ + e |= (upper >> 4) & 0x800; /* sign */ + ll = (ldq(ptr) >> 11) & ((1LL << 52) - 1); +#ifdef __arm__ + temp.l.upper = (e << 20) | (ll >> 32); + temp.l.lower = ll; +#else + temp.ll = ll | ((uint64_t)e << 52); +#endif + return temp.d; +} + +static inline void helper_fstt(CPU86_LDouble f, target_ulong ptr) +{ + CPU86_LDoubleU temp; + int e; + + temp.d = f; + /* mantissa */ + stq(ptr, (MANTD(temp) << 11) | (1LL << 63)); + /* exponent + sign */ + e = EXPD(temp) - EXPBIAS + 16383; + e |= SIGND(temp) >> 16; + stw(ptr + 8, e); +} +#else + +/* we use memory access macros */ + +static inline CPU86_LDouble helper_fldt(target_ulong ptr) +{ + CPU86_LDoubleU temp; + + temp.l.lower = ldq(ptr); + temp.l.upper = lduw(ptr + 8); + return temp.d; +} + +static inline void helper_fstt(CPU86_LDouble f, target_ulong ptr) +{ + CPU86_LDoubleU temp; + + temp.d = f; + stq(ptr, temp.l.lower); + stw(ptr + 8, temp.l.upper); +} + +#endif /* USE_X86LDOUBLE */ + +#define FPUS_IE (1 << 0) +#define FPUS_DE (1 << 1) +#define FPUS_ZE (1 << 2) +#define FPUS_OE (1 << 3) +#define FPUS_UE (1 << 4) +#define FPUS_PE (1 << 5) +#define FPUS_SF (1 << 6) +#define FPUS_SE (1 << 7) +#define FPUS_B (1 << 15) + +#define FPUC_EM 0x3f + +static inline uint32_t compute_eflags(void) +{ + return env->eflags | helper_cc_compute_all(CC_OP) | (DF & DF_MASK); +} + +/* NOTE: CC_OP must be modified manually to CC_OP_EFLAGS */ +static inline void load_eflags(int eflags, int update_mask) +{ + CC_SRC = eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + DF = 1 - (2 * ((eflags >> 10) & 1)); + env->eflags = (env->eflags & ~update_mask) | + (eflags & update_mask) | 0x2; +} + +static inline int cpu_has_work(CPUState *env) +{ + int work; + + work = (env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK); + work |= env->interrupt_request & CPU_INTERRUPT_NMI; + work |= env->interrupt_request & CPU_INTERRUPT_INIT; + work |= env->interrupt_request & CPU_INTERRUPT_SIPI; + + return work; +} + +static inline int cpu_halted(CPUState *env) { + /* handle exit of HALTED state */ + if (!env->halted) + return 0; + /* disable halt condition */ + if (cpu_has_work(env)) { + env->halted = 0; + return 0; + } + return EXCP_HALTED; +} + +/* load efer and update the corresponding hflags. XXX: do consistency + checks with cpuid bits ? */ +static inline void cpu_load_efer(CPUState *env, uint64_t val) +{ + env->efer = val; + env->hflags &= ~(HF_LMA_MASK | HF_SVME_MASK); + if (env->efer & MSR_EFER_LMA) + env->hflags |= HF_LMA_MASK; + if (env->efer & MSR_EFER_SVME) + env->hflags |= HF_SVME_MASK; +} + +static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb) +{ + env->eip = tb->pc - tb->cs_base; +} + diff --git a/src/recompiler/target-i386/helper.c b/src/recompiler/target-i386/helper.c new file mode 100644 index 00000000..a6f37ff6 --- /dev/null +++ b/src/recompiler/target-i386/helper.c @@ -0,0 +1,1227 @@ +/* + * i386 helpers (without register variable usage) + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#ifndef VBOX +#include <inttypes.h> +#include <signal.h> +#endif /* !VBOX */ + +#include "cpu.h" +#include "exec-all.h" +#include "qemu-common.h" +#include "kvm.h" + +//#define DEBUG_MMU + +/* NOTE: must be called outside the CPU execute loop */ +void cpu_reset(CPUX86State *env) +{ + int i; + + if (qemu_loglevel_mask(CPU_LOG_RESET)) { + qemu_log("CPU Reset (CPU %d)\n", env->cpu_index); + log_cpu_state(env, X86_DUMP_FPU | X86_DUMP_CCOP); + } + + memset(env, 0, offsetof(CPUX86State, breakpoints)); + + tlb_flush(env, 1); + + env->old_exception = -1; + + /* init to reset state */ + +#ifdef CONFIG_SOFTMMU + env->hflags |= HF_SOFTMMU_MASK; +#endif + env->hflags2 |= HF2_GIF_MASK; + + cpu_x86_update_cr0(env, 0x60000010); + env->a20_mask = ~0x0; + env->smbase = 0x30000; + + env->idt.limit = 0xffff; + env->gdt.limit = 0xffff; + env->ldt.limit = 0xffff; + env->ldt.flags = DESC_P_MASK | (2 << DESC_TYPE_SHIFT); + env->tr.limit = 0xffff; + env->tr.flags = DESC_P_MASK | (11 << DESC_TYPE_SHIFT); + + cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff, + DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | + DESC_R_MASK | DESC_A_MASK); + cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff, + DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | + DESC_A_MASK); + cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff, + DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | + DESC_A_MASK); + cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff, + DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | + DESC_A_MASK); + cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff, + DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | + DESC_A_MASK); + cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff, + DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | + DESC_A_MASK); + + env->eip = 0xfff0; +#ifndef VBOX /* We'll get the right value from CPUM. 
*/ + env->regs[R_EDX] = env->cpuid_version; +#endif + + env->eflags = 0x2; + + /* FPU init */ + for(i = 0;i < 8; i++) + env->fptags[i] = 1; + env->fpuc = 0x37f; + + env->mxcsr = 0x1f80; + + memset(env->dr, 0, sizeof(env->dr)); + env->dr[6] = DR6_FIXED_1; + env->dr[7] = DR7_FIXED_1; + cpu_breakpoint_remove_all(env, BP_CPU); + cpu_watchpoint_remove_all(env, BP_CPU); + +#ifndef VBOX + env->mcg_status = 0; +#endif +} + +void cpu_x86_close(CPUX86State *env) +{ +#ifndef VBOX + qemu_free(env); +#endif +} + +/***********************************************************/ +/* x86 debug */ + +static const char *cc_op_str[] = { + "DYNAMIC", + "EFLAGS", + + "MULB", + "MULW", + "MULL", + "MULQ", + + "ADDB", + "ADDW", + "ADDL", + "ADDQ", + + "ADCB", + "ADCW", + "ADCL", + "ADCQ", + + "SUBB", + "SUBW", + "SUBL", + "SUBQ", + + "SBBB", + "SBBW", + "SBBL", + "SBBQ", + + "LOGICB", + "LOGICW", + "LOGICL", + "LOGICQ", + + "INCB", + "INCW", + "INCL", + "INCQ", + + "DECB", + "DECW", + "DECL", + "DECQ", + + "SHLB", + "SHLW", + "SHLL", + "SHLQ", + + "SARB", + "SARW", + "SARL", + "SARQ", +}; + +static void +cpu_x86_dump_seg_cache(CPUState *env, FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...), + const char *name, struct SegmentCache *sc) +{ +#ifdef VBOX +# define cpu_fprintf(f, ...) RTLogPrintf(__VA_ARGS__) +#endif +#ifdef TARGET_X86_64 + if (env->hflags & HF_CS64_MASK) { + cpu_fprintf(f, "%-3s=%04x %016" PRIx64 " %08x %08x", name, + sc->selector, sc->base, sc->limit, sc->flags); + } else +#endif + { + cpu_fprintf(f, "%-3s=%04x %08x %08x %08x", name, sc->selector, + (uint32_t)sc->base, sc->limit, sc->flags); + } + + if (!(env->hflags & HF_PE_MASK) || !(sc->flags & DESC_P_MASK)) + goto done; + + cpu_fprintf(f, " DPL=%d ", (sc->flags & DESC_DPL_MASK) >> DESC_DPL_SHIFT); + if (sc->flags & DESC_S_MASK) { + if (sc->flags & DESC_CS_MASK) { + cpu_fprintf(f, (sc->flags & DESC_L_MASK) ? "CS64" : + ((sc->flags & DESC_B_MASK) ? "CS32" : "CS16")); + cpu_fprintf(f, " [%c%c", (sc->flags & DESC_C_MASK) ? 'C' : '-', + (sc->flags & DESC_R_MASK) ? 'R' : '-'); + } else { + cpu_fprintf(f, (sc->flags & DESC_B_MASK) ? "DS " : "DS16"); + cpu_fprintf(f, " [%c%c", (sc->flags & DESC_E_MASK) ? 'E' : '-', + (sc->flags & DESC_W_MASK) ? 'W' : '-'); + } + cpu_fprintf(f, "%c]", (sc->flags & DESC_A_MASK) ? 'A' : '-'); + } else { + static const char *sys_type_name[2][16] = { + { /* 32 bit mode */ + "Reserved", "TSS16-avl", "LDT", "TSS16-busy", + "CallGate16", "TaskGate", "IntGate16", "TrapGate16", + "Reserved", "TSS32-avl", "Reserved", "TSS32-busy", + "CallGate32", "Reserved", "IntGate32", "TrapGate32" + }, + { /* 64 bit mode */ + "<hiword>", "Reserved", "LDT", "Reserved", "Reserved", + "Reserved", "Reserved", "Reserved", "Reserved", + "TSS64-avl", "Reserved", "TSS64-busy", "CallGate64", + "Reserved", "IntGate64", "TrapGate64" + } + }; + cpu_fprintf(f, "%s", + sys_type_name[(env->hflags & HF_LMA_MASK) ? 1 : 0] + [(sc->flags & DESC_TYPE_MASK) + >> DESC_TYPE_SHIFT]); + } +done: + cpu_fprintf(f, "\n"); +#ifdef VBOX +# undef cpu_fprintf +#endif +} + +void cpu_dump_state(CPUState *env, FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...), + int flags) +{ + int eflags, i, nb; + char cc_op_name[32]; + static const char *seg_name[6] = { "ES", "CS", "SS", "DS", "FS", "GS" }; + +#ifdef VBOX +# define cpu_fprintf(f, ...) 
RTLogPrintf(__VA_ARGS__) +#endif + cpu_synchronize_state(env); + + eflags = env->eflags; +#ifdef TARGET_X86_64 + if (env->hflags & HF_CS64_MASK) { + cpu_fprintf(f, + "RAX=%016" PRIx64 " RBX=%016" PRIx64 " RCX=%016" PRIx64 " RDX=%016" PRIx64 "\n" + "RSI=%016" PRIx64 " RDI=%016" PRIx64 " RBP=%016" PRIx64 " RSP=%016" PRIx64 "\n" + "R8 =%016" PRIx64 " R9 =%016" PRIx64 " R10=%016" PRIx64 " R11=%016" PRIx64 "\n" + "R12=%016" PRIx64 " R13=%016" PRIx64 " R14=%016" PRIx64 " R15=%016" PRIx64 "\n" + "RIP=%016" PRIx64 " RFL=%08x [%c%c%c%c%c%c%c] CPL=%d II=%d A20=%d SMM=%d HLT=%d\n", + env->regs[R_EAX], + env->regs[R_EBX], + env->regs[R_ECX], + env->regs[R_EDX], + env->regs[R_ESI], + env->regs[R_EDI], + env->regs[R_EBP], + env->regs[R_ESP], + env->regs[8], + env->regs[9], + env->regs[10], + env->regs[11], + env->regs[12], + env->regs[13], + env->regs[14], + env->regs[15], + env->eip, eflags, + eflags & DF_MASK ? 'D' : '-', + eflags & CC_O ? 'O' : '-', + eflags & CC_S ? 'S' : '-', + eflags & CC_Z ? 'Z' : '-', + eflags & CC_A ? 'A' : '-', + eflags & CC_P ? 'P' : '-', + eflags & CC_C ? 'C' : '-', + env->hflags & HF_CPL_MASK, + (env->hflags >> HF_INHIBIT_IRQ_SHIFT) & 1, + (env->a20_mask >> 20) & 1, + (env->hflags >> HF_SMM_SHIFT) & 1, + env->halted); + } else +#endif + { + cpu_fprintf(f, "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n" + "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n" + "EIP=%08x EFL=%08x [%c%c%c%c%c%c%c] CPL=%d II=%d A20=%d SMM=%d HLT=%d\n", + (uint32_t)env->regs[R_EAX], + (uint32_t)env->regs[R_EBX], + (uint32_t)env->regs[R_ECX], + (uint32_t)env->regs[R_EDX], + (uint32_t)env->regs[R_ESI], + (uint32_t)env->regs[R_EDI], + (uint32_t)env->regs[R_EBP], + (uint32_t)env->regs[R_ESP], + (uint32_t)env->eip, eflags, + eflags & DF_MASK ? 'D' : '-', + eflags & CC_O ? 'O' : '-', + eflags & CC_S ? 'S' : '-', + eflags & CC_Z ? 'Z' : '-', + eflags & CC_A ? 'A' : '-', + eflags & CC_P ? 'P' : '-', + eflags & CC_C ? 
'C' : '-', + env->hflags & HF_CPL_MASK, + (env->hflags >> HF_INHIBIT_IRQ_SHIFT) & 1, + (env->a20_mask >> 20) & 1, + (env->hflags >> HF_SMM_SHIFT) & 1, + env->halted); + } + + for(i = 0; i < 6; i++) { + cpu_x86_dump_seg_cache(env, f, cpu_fprintf, seg_name[i], + &env->segs[i]); + } + cpu_x86_dump_seg_cache(env, f, cpu_fprintf, "LDT", &env->ldt); + cpu_x86_dump_seg_cache(env, f, cpu_fprintf, "TR", &env->tr); + +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) { + cpu_fprintf(f, "GDT= %016" PRIx64 " %08x\n", + env->gdt.base, env->gdt.limit); + cpu_fprintf(f, "IDT= %016" PRIx64 " %08x\n", + env->idt.base, env->idt.limit); + cpu_fprintf(f, "CR0=%08x CR2=%016" PRIx64 " CR3=%016" PRIx64 " CR4=%08x\n", + (uint32_t)env->cr[0], + env->cr[2], + env->cr[3], + (uint32_t)env->cr[4]); + for(i = 0; i < 4; i++) + cpu_fprintf(f, "DR%d=%016" PRIx64 " ", i, env->dr[i]); + cpu_fprintf(f, "\nDR6=%016" PRIx64 " DR7=%016" PRIx64 "\n", + env->dr[6], env->dr[7]); + } else +#endif + { + cpu_fprintf(f, "GDT= %08x %08x\n", + (uint32_t)env->gdt.base, env->gdt.limit); + cpu_fprintf(f, "IDT= %08x %08x\n", + (uint32_t)env->idt.base, env->idt.limit); + cpu_fprintf(f, "CR0=%08x CR2=%08x CR3=%08x CR4=%08x\n", + (uint32_t)env->cr[0], + (uint32_t)env->cr[2], + (uint32_t)env->cr[3], + (uint32_t)env->cr[4]); + for(i = 0; i < 4; i++) + cpu_fprintf(f, "DR%d=%08x ", i, env->dr[i]); + cpu_fprintf(f, "\nDR6=%08x DR7=%08x\n", env->dr[6], env->dr[7]); + } + if (flags & X86_DUMP_CCOP) { + if ((unsigned)env->cc_op < CC_OP_NB) + snprintf(cc_op_name, sizeof(cc_op_name), "%s", cc_op_str[env->cc_op]); + else + snprintf(cc_op_name, sizeof(cc_op_name), "[%d]", env->cc_op); +#ifdef TARGET_X86_64 + if (env->hflags & HF_CS64_MASK) { + cpu_fprintf(f, "CCS=%016" PRIx64 " CCD=%016" PRIx64 " CCO=%-8s\n", + env->cc_src, env->cc_dst, + cc_op_name); + } else +#endif + { + cpu_fprintf(f, "CCS=%08x CCD=%08x CCO=%-8s\n", + (uint32_t)env->cc_src, (uint32_t)env->cc_dst, + cc_op_name); + } + } + cpu_fprintf(f, "EFER=%016" PRIx64 "\n", env->efer); + if (flags & X86_DUMP_FPU) { + int fptag; + fptag = 0; + for(i = 0; i < 8; i++) { + fptag |= ((!env->fptags[i]) << i); + } + cpu_fprintf(f, "FCW=%04x FSW=%04x [ST=%d] FTW=%02x MXCSR=%08x\n", + env->fpuc, + (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11, + env->fpstt, + fptag, + env->mxcsr); + for(i=0;i<8;i++) { +#if defined(USE_X86LDOUBLE) + union { + long double d; + struct { + uint64_t lower; + uint16_t upper; + } l; + } tmp; + tmp.d = env->fpregs[i].d; + cpu_fprintf(f, "FPR%d=%016" PRIx64 " %04x", + i, tmp.l.lower, tmp.l.upper); +#else + cpu_fprintf(f, "FPR%d=%016" PRIx64, + i, env->fpregs[i].mmx.q); +#endif + if ((i & 1) == 1) + cpu_fprintf(f, "\n"); + else + cpu_fprintf(f, " "); + } + if (env->hflags & HF_CS64_MASK) + nb = 16; + else + nb = 8; + for(i=0;i<nb;i++) { + cpu_fprintf(f, "XMM%02d=%08x%08x%08x%08x", + i, + env->xmm_regs[i].XMM_L(3), + env->xmm_regs[i].XMM_L(2), + env->xmm_regs[i].XMM_L(1), + env->xmm_regs[i].XMM_L(0)); + if ((i & 1) == 1) + cpu_fprintf(f, "\n"); + else + cpu_fprintf(f, " "); + } + } +#ifdef VBOX +# undef cpu_fprintf +#endif +} + +/***********************************************************/ +/* x86 mmu */ +/* XXX: add PGE support */ + +void cpu_x86_set_a20(CPUX86State *env, int a20_state) +{ + a20_state = (a20_state != 0); + if (a20_state != ((env->a20_mask >> 20) & 1)) { +#if defined(DEBUG_MMU) + printf("A20 update: a20=%d\n", a20_state); +#endif + /* if the cpu is currently executing code, we must unlink it and + all the potentially executing TB */ + cpu_interrupt(env, 
CPU_INTERRUPT_EXITTB); + + /* when a20 is changed, all the MMU mappings are invalid, so + we must flush everything */ + tlb_flush(env, 1); + env->a20_mask = ~(1 << 20) | (a20_state << 20); + } +} + +void cpu_x86_update_cr0(CPUX86State *env, uint32_t new_cr0) +{ + int pe_state; + +#if defined(DEBUG_MMU) + printf("CR0 update: CR0=0x%08x\n", new_cr0); +#endif + if ((new_cr0 & (CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK)) != + (env->cr[0] & (CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK))) { + tlb_flush(env, 1); + } + +#ifdef TARGET_X86_64 + if (!(env->cr[0] & CR0_PG_MASK) && (new_cr0 & CR0_PG_MASK) && + (env->efer & MSR_EFER_LME)) { + /* enter in long mode */ + /* XXX: generate an exception */ + if (!(env->cr[4] & CR4_PAE_MASK)) + return; + env->efer |= MSR_EFER_LMA; + env->hflags |= HF_LMA_MASK; + } else if ((env->cr[0] & CR0_PG_MASK) && !(new_cr0 & CR0_PG_MASK) && + (env->efer & MSR_EFER_LMA)) { + /* exit long mode */ + env->efer &= ~MSR_EFER_LMA; + env->hflags &= ~(HF_LMA_MASK | HF_CS64_MASK); + env->eip &= 0xffffffff; + } +#endif + env->cr[0] = new_cr0 | CR0_ET_MASK; + + /* update PE flag in hidden flags */ + pe_state = (env->cr[0] & CR0_PE_MASK); + env->hflags = (env->hflags & ~HF_PE_MASK) | (pe_state << HF_PE_SHIFT); + /* ensure that ADDSEG is always set in real mode */ + env->hflags |= ((pe_state ^ 1) << HF_ADDSEG_SHIFT); + /* update FPU flags */ + env->hflags = (env->hflags & ~(HF_MP_MASK | HF_EM_MASK | HF_TS_MASK)) | + ((new_cr0 << (HF_MP_SHIFT - 1)) & (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK)); +#ifdef VBOX + remR3ChangeCpuMode(env); +#endif +} + +/* XXX: in legacy PAE mode, generate a GPF if reserved bits are set in + the PDPT */ +void cpu_x86_update_cr3(CPUX86State *env, target_ulong new_cr3) +{ + env->cr[3] = new_cr3; + if (env->cr[0] & CR0_PG_MASK) { +#if defined(DEBUG_MMU) + printf("CR3 update: CR3=" TARGET_FMT_lx "\n", new_cr3); +#endif + tlb_flush(env, 0); + } +} + +void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4) +{ +#if defined(DEBUG_MMU) + printf("CR4 update: CR4=%08x\n", (uint32_t)env->cr[4]); +#endif + if ((new_cr4 & (CR4_PGE_MASK | CR4_PAE_MASK | CR4_PSE_MASK)) != + (env->cr[4] & (CR4_PGE_MASK | CR4_PAE_MASK | CR4_PSE_MASK))) { + tlb_flush(env, 1); + } + /* SSE handling */ + if (!(env->cpuid_features & CPUID_SSE)) + new_cr4 &= ~CR4_OSFXSR_MASK; + if (new_cr4 & CR4_OSFXSR_MASK) + env->hflags |= HF_OSFXSR_MASK; + else + env->hflags &= ~HF_OSFXSR_MASK; + + env->cr[4] = new_cr4; +#ifdef VBOX + remR3ChangeCpuMode(env); +#endif +} + +#if defined(CONFIG_USER_ONLY) + +int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr, + int is_write, int mmu_idx, int is_softmmu) +{ + /* user mode only emulation */ + is_write &= 1; + env->cr[2] = addr; + env->error_code = (is_write << PG_ERROR_W_BIT); + env->error_code |= PG_ERROR_U_MASK; + env->exception_index = EXCP0E_PAGE; + return 1; +} + +#else + +/* XXX: This value should match the one returned by CPUID + * and in exec.c */ +# if defined(TARGET_X86_64) +# define PHYS_ADDR_MASK 0xfffffff000LL +# else +# define PHYS_ADDR_MASK 0xffffff000LL +# endif + +/* return value: + -1 = cannot handle fault + 0 = nothing more to do + 1 = generate PF fault +*/ +int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr, + int is_write1, int mmu_idx, int is_softmmu) +{ + uint64_t ptep, pte; + target_ulong pde_addr, pte_addr; + int error_code, is_dirty, prot, page_size, is_write, is_user; + target_phys_addr_t paddr; + uint32_t page_offset; + target_ulong vaddr, virt_addr; + + is_user = mmu_idx == MMU_USER_IDX; +#if defined(DEBUG_MMU) + 
printf("MMU fault: addr=" TARGET_FMT_lx " w=%d u=%d eip=" TARGET_FMT_lx "\n", + addr, is_write1, is_user, env->eip); +#endif + is_write = is_write1 & 1; + + if (!(env->cr[0] & CR0_PG_MASK)) { + pte = addr; + virt_addr = addr & TARGET_PAGE_MASK; + prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + page_size = 4096; + goto do_mapping; + } + + if (env->cr[4] & CR4_PAE_MASK) { + uint64_t pde, pdpe; + target_ulong pdpe_addr; + +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) { + uint64_t pml4e_addr, pml4e; + int32_t sext; + + /* test virtual address sign extension */ + sext = (int64_t)addr >> 47; + if (sext != 0 && sext != -1) { + env->error_code = 0; + env->exception_index = EXCP0D_GPF; + return 1; + } + + pml4e_addr = ((env->cr[3] & ~0xfff) + (((addr >> 39) & 0x1ff) << 3)) & + env->a20_mask; + pml4e = ldq_phys(pml4e_addr); + if (!(pml4e & PG_PRESENT_MASK)) { + error_code = 0; + goto do_fault; + } + if (!(env->efer & MSR_EFER_NXE) && (pml4e & PG_NX_MASK)) { + error_code = PG_ERROR_RSVD_MASK; + goto do_fault; + } + if (!(pml4e & PG_ACCESSED_MASK)) { + pml4e |= PG_ACCESSED_MASK; + stl_phys_notdirty(pml4e_addr, pml4e); + } + ptep = pml4e ^ PG_NX_MASK; + pdpe_addr = ((pml4e & PHYS_ADDR_MASK) + (((addr >> 30) & 0x1ff) << 3)) & + env->a20_mask; + pdpe = ldq_phys(pdpe_addr); + if (!(pdpe & PG_PRESENT_MASK)) { + error_code = 0; + goto do_fault; + } + if (!(env->efer & MSR_EFER_NXE) && (pdpe & PG_NX_MASK)) { + error_code = PG_ERROR_RSVD_MASK; + goto do_fault; + } + ptep &= pdpe ^ PG_NX_MASK; + if (!(pdpe & PG_ACCESSED_MASK)) { + pdpe |= PG_ACCESSED_MASK; + stl_phys_notdirty(pdpe_addr, pdpe); + } + } else +#endif + { + /* XXX: load them when cr3 is loaded ? */ + pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) & + env->a20_mask; + pdpe = ldq_phys(pdpe_addr); + if (!(pdpe & PG_PRESENT_MASK)) { + error_code = 0; + goto do_fault; + } + ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK; + } + + pde_addr = ((pdpe & PHYS_ADDR_MASK) + (((addr >> 21) & 0x1ff) << 3)) & + env->a20_mask; + pde = ldq_phys(pde_addr); + if (!(pde & PG_PRESENT_MASK)) { + error_code = 0; + goto do_fault; + } + if (!(env->efer & MSR_EFER_NXE) && (pde & PG_NX_MASK)) { + error_code = PG_ERROR_RSVD_MASK; + goto do_fault; + } + ptep &= pde ^ PG_NX_MASK; + if (pde & PG_PSE_MASK) { + /* 2 MB page */ + page_size = 2048 * 1024; + ptep ^= PG_NX_MASK; + if ((ptep & PG_NX_MASK) && is_write1 == 2) + goto do_fault_protect; + if (is_user) { + if (!(ptep & PG_USER_MASK)) + goto do_fault_protect; + if (is_write && !(ptep & PG_RW_MASK)) + goto do_fault_protect; + } else { + if ((env->cr[0] & CR0_WP_MASK) && + is_write && !(ptep & PG_RW_MASK)) + goto do_fault_protect; + } + is_dirty = is_write && !(pde & PG_DIRTY_MASK); + if (!(pde & PG_ACCESSED_MASK) || is_dirty) { + pde |= PG_ACCESSED_MASK; + if (is_dirty) + pde |= PG_DIRTY_MASK; + stl_phys_notdirty(pde_addr, pde); + } + /* align to page_size */ + pte = pde & ((PHYS_ADDR_MASK & ~(page_size - 1)) | 0xfff); + virt_addr = addr & ~(page_size - 1); + } else { + /* 4 KB page */ + if (!(pde & PG_ACCESSED_MASK)) { + pde |= PG_ACCESSED_MASK; + stl_phys_notdirty(pde_addr, pde); + } + pte_addr = ((pde & PHYS_ADDR_MASK) + (((addr >> 12) & 0x1ff) << 3)) & + env->a20_mask; + pte = ldq_phys(pte_addr); + if (!(pte & PG_PRESENT_MASK)) { + error_code = 0; + goto do_fault; + } + if (!(env->efer & MSR_EFER_NXE) && (pte & PG_NX_MASK)) { + error_code = PG_ERROR_RSVD_MASK; + goto do_fault; + } + /* combine pde and pte nx, user and rw protections */ + ptep &= pte ^ PG_NX_MASK; + ptep ^= PG_NX_MASK; + if ((ptep & 
PG_NX_MASK) && is_write1 == 2) + goto do_fault_protect; + if (is_user) { + if (!(ptep & PG_USER_MASK)) + goto do_fault_protect; + if (is_write && !(ptep & PG_RW_MASK)) + goto do_fault_protect; + } else { + if ((env->cr[0] & CR0_WP_MASK) && + is_write && !(ptep & PG_RW_MASK)) + goto do_fault_protect; + } + is_dirty = is_write && !(pte & PG_DIRTY_MASK); + if (!(pte & PG_ACCESSED_MASK) || is_dirty) { + pte |= PG_ACCESSED_MASK; + if (is_dirty) + pte |= PG_DIRTY_MASK; + stl_phys_notdirty(pte_addr, pte); + } + page_size = 4096; + virt_addr = addr & ~0xfff; + pte = pte & (PHYS_ADDR_MASK | 0xfff); + } + } else { + uint32_t pde; + + /* page directory entry */ + pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & + env->a20_mask; + pde = ldl_phys(pde_addr); + if (!(pde & PG_PRESENT_MASK)) { + error_code = 0; + goto do_fault; + } + /* if PSE bit is set, then we use a 4MB page */ + if ((pde & PG_PSE_MASK) && (env->cr[4] & CR4_PSE_MASK)) { + page_size = 4096 * 1024; + if (is_user) { + if (!(pde & PG_USER_MASK)) + goto do_fault_protect; + if (is_write && !(pde & PG_RW_MASK)) + goto do_fault_protect; + } else { + if ((env->cr[0] & CR0_WP_MASK) && + is_write && !(pde & PG_RW_MASK)) + goto do_fault_protect; + } + is_dirty = is_write && !(pde & PG_DIRTY_MASK); + if (!(pde & PG_ACCESSED_MASK) || is_dirty) { + pde |= PG_ACCESSED_MASK; + if (is_dirty) + pde |= PG_DIRTY_MASK; + stl_phys_notdirty(pde_addr, pde); + } + + pte = pde & ~( (page_size - 1) & ~0xfff); /* align to page_size */ + ptep = pte; + virt_addr = addr & ~(page_size - 1); + } else { + if (!(pde & PG_ACCESSED_MASK)) { + pde |= PG_ACCESSED_MASK; + stl_phys_notdirty(pde_addr, pde); + } + + /* page directory entry */ + pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & + env->a20_mask; + pte = ldl_phys(pte_addr); + if (!(pte & PG_PRESENT_MASK)) { + error_code = 0; + goto do_fault; + } + /* combine pde and pte user and rw protections */ + ptep = pte & pde; + if (is_user) { + if (!(ptep & PG_USER_MASK)) + goto do_fault_protect; + if (is_write && !(ptep & PG_RW_MASK)) + goto do_fault_protect; + } else { + if ((env->cr[0] & CR0_WP_MASK) && + is_write && !(ptep & PG_RW_MASK)) + goto do_fault_protect; + } + is_dirty = is_write && !(pte & PG_DIRTY_MASK); + if (!(pte & PG_ACCESSED_MASK) || is_dirty) { + pte |= PG_ACCESSED_MASK; + if (is_dirty) + pte |= PG_DIRTY_MASK; + stl_phys_notdirty(pte_addr, pte); + } + page_size = 4096; + virt_addr = addr & ~0xfff; + } + } + /* the page can be put in the TLB */ + prot = PAGE_READ; + if (!(ptep & PG_NX_MASK)) + prot |= PAGE_EXEC; + if (pte & PG_DIRTY_MASK) { + /* only set write access if already dirty... 
otherwise wait + for dirty access */ + if (is_user) { + if (ptep & PG_RW_MASK) + prot |= PAGE_WRITE; + } else { + if (!(env->cr[0] & CR0_WP_MASK) || + (ptep & PG_RW_MASK)) + prot |= PAGE_WRITE; + } + } + do_mapping: +#ifndef VBOX + pte = pte & env->a20_mask; +#endif + + /* Even if 4MB pages, we map only one 4KB page in the cache to + avoid filling it too fast */ + page_offset = (addr & TARGET_PAGE_MASK) & (page_size - 1); + paddr = (pte & TARGET_PAGE_MASK) + page_offset; +#ifdef VBOX + paddr &= env->a20_mask; +#endif + vaddr = virt_addr + page_offset; + + tlb_set_page(env, vaddr, paddr, prot, mmu_idx, page_size); + return 0; + do_fault_protect: + error_code = PG_ERROR_P_MASK; + do_fault: + error_code |= (is_write << PG_ERROR_W_BIT); + if (is_user) + error_code |= PG_ERROR_U_MASK; + if (is_write1 == 2 && + (env->efer & MSR_EFER_NXE) && + (env->cr[4] & CR4_PAE_MASK)) + error_code |= PG_ERROR_I_D_MASK; + if (env->intercept_exceptions & (1 << EXCP0E_PAGE)) { + /* cr2 is not modified in case of exceptions */ + stq_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2), + addr); + } else { + env->cr[2] = addr; + } + env->error_code = error_code; + env->exception_index = EXCP0E_PAGE; + return 1; +} + +target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, target_ulong addr) +{ + target_ulong pde_addr, pte_addr; + uint64_t pte; + target_phys_addr_t paddr; + uint32_t page_offset; + int page_size; + + if (env->cr[4] & CR4_PAE_MASK) { + target_ulong pdpe_addr; + uint64_t pde, pdpe; + +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) { + uint64_t pml4e_addr, pml4e; + int32_t sext; + + /* test virtual address sign extension */ + sext = (int64_t)addr >> 47; + if (sext != 0 && sext != -1) + return -1; + + pml4e_addr = ((env->cr[3] & ~0xfff) + (((addr >> 39) & 0x1ff) << 3)) & + env->a20_mask; + pml4e = ldq_phys(pml4e_addr); + if (!(pml4e & PG_PRESENT_MASK)) + return -1; + + pdpe_addr = ((pml4e & ~0xfff) + (((addr >> 30) & 0x1ff) << 3)) & + env->a20_mask; + pdpe = ldq_phys(pdpe_addr); + if (!(pdpe & PG_PRESENT_MASK)) + return -1; + } else +#endif + { + pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) & + env->a20_mask; + pdpe = ldq_phys(pdpe_addr); + if (!(pdpe & PG_PRESENT_MASK)) + return -1; + } + + pde_addr = ((pdpe & ~0xfff) + (((addr >> 21) & 0x1ff) << 3)) & + env->a20_mask; + pde = ldq_phys(pde_addr); + if (!(pde & PG_PRESENT_MASK)) { + return -1; + } + if (pde & PG_PSE_MASK) { + /* 2 MB page */ + page_size = 2048 * 1024; + pte = pde & ~( (page_size - 1) & ~0xfff); /* align to page_size */ + } else { + /* 4 KB page */ + pte_addr = ((pde & ~0xfff) + (((addr >> 12) & 0x1ff) << 3)) & + env->a20_mask; + page_size = 4096; + pte = ldq_phys(pte_addr); + } + if (!(pte & PG_PRESENT_MASK)) + return -1; + } else { + uint32_t pde; + + if (!(env->cr[0] & CR0_PG_MASK)) { + pte = addr; + page_size = 4096; + } else { + /* page directory entry */ + pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & env->a20_mask; + pde = ldl_phys(pde_addr); + if (!(pde & PG_PRESENT_MASK)) + return -1; + if ((pde & PG_PSE_MASK) && (env->cr[4] & CR4_PSE_MASK)) { + pte = pde & ~0x003ff000; /* align to 4MB */ + page_size = 4096 * 1024; + } else { + /* page directory entry */ + pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & env->a20_mask; + pte = ldl_phys(pte_addr); + if (!(pte & PG_PRESENT_MASK)) + return -1; + page_size = 4096; + } + } + pte = pte & env->a20_mask; + } + + page_offset = (addr & TARGET_PAGE_MASK) & (page_size - 1); + paddr = (pte & TARGET_PAGE_MASK) + page_offset; + return 
paddr; +} + +void hw_breakpoint_insert(CPUState *env, int index) +{ + int type, err = 0; + + switch (hw_breakpoint_type(env->dr[7], index)) { + case 0: + if (hw_breakpoint_enabled(env->dr[7], index)) + err = cpu_breakpoint_insert(env, env->dr[index], BP_CPU, + &env->cpu_breakpoint[index]); + break; + case 1: + type = BP_CPU | BP_MEM_WRITE; + goto insert_wp; + case 2: + /* No support for I/O watchpoints yet */ + break; + case 3: + type = BP_CPU | BP_MEM_ACCESS; + insert_wp: + err = cpu_watchpoint_insert(env, env->dr[index], + hw_breakpoint_len(env->dr[7], index), + type, &env->cpu_watchpoint[index]); + break; + } + if (err) + env->cpu_breakpoint[index] = NULL; +} + +void hw_breakpoint_remove(CPUState *env, int index) +{ + if (!env->cpu_breakpoint[index]) + return; + switch (hw_breakpoint_type(env->dr[7], index)) { + case 0: + if (hw_breakpoint_enabled(env->dr[7], index)) + cpu_breakpoint_remove_by_ref(env, env->cpu_breakpoint[index]); + break; + case 1: + case 3: + cpu_watchpoint_remove_by_ref(env, env->cpu_watchpoint[index]); + break; + case 2: + /* No support for I/O watchpoints yet */ + break; + } +} + +int check_hw_breakpoints(CPUState *env, int force_dr6_update) +{ + target_ulong dr6; + int reg, type; + int hit_enabled = 0; + + dr6 = env->dr[6] & ~0xf; + for (reg = 0; reg < 4; reg++) { + type = hw_breakpoint_type(env->dr[7], reg); + if ((type == 0 && env->dr[reg] == env->eip) || + ((type & 1) && env->cpu_watchpoint[reg] && + (env->cpu_watchpoint[reg]->flags & BP_WATCHPOINT_HIT))) { + dr6 |= 1 << reg; + if (hw_breakpoint_enabled(env->dr[7], reg)) + hit_enabled = 1; + } + } + if (hit_enabled || force_dr6_update) + env->dr[6] = dr6; + return hit_enabled; +} + +static CPUDebugExcpHandler *prev_debug_excp_handler; + +void raise_exception_env(int exception_index, CPUState *env); + +static void breakpoint_handler(CPUState *env) +{ + CPUBreakpoint *bp; + + if (env->watchpoint_hit) { + if (env->watchpoint_hit->flags & BP_CPU) { + env->watchpoint_hit = NULL; + if (check_hw_breakpoints(env, 0)) + raise_exception_env(EXCP01_DB, env); + else + cpu_resume_from_signal(env, NULL); + } + } else { + QTAILQ_FOREACH(bp, &env->breakpoints, entry) + if (bp->pc == env->eip) { + if (bp->flags & BP_CPU) { + check_hw_breakpoints(env, 1); + raise_exception_env(EXCP01_DB, env); + } + break; + } + } + if (prev_debug_excp_handler) + prev_debug_excp_handler(env); +} + +#ifndef VBOX +/* This should come from sysemu.h - if we could include it here... 
*/ +void qemu_system_reset_request(void); + +void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, + uint64_t mcg_status, uint64_t addr, uint64_t misc) +{ + uint64_t mcg_cap = cenv->mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + uint64_t *banks = cenv->mce_banks; + + if (bank >= bank_num || !(status & MCI_STATUS_VAL)) + return; + + /* + * if MSR_MCG_CTL is not all 1s, the uncorrected error + * reporting is disabled + */ + if ((status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && + cenv->mcg_ctl != ~(uint64_t)0) + return; + banks += 4 * bank; + /* + * if MSR_MCi_CTL is not all 1s, the uncorrected error + * reporting is disabled for the bank + */ + if ((status & MCI_STATUS_UC) && banks[0] != ~(uint64_t)0) + return; + if (status & MCI_STATUS_UC) { + if ((cenv->mcg_status & MCG_STATUS_MCIP) || + !(cenv->cr[4] & CR4_MCE_MASK)) { + fprintf(stderr, "injects mce exception while previous " + "one is in progress!\n"); + qemu_log_mask(CPU_LOG_RESET, "Triple fault\n"); + qemu_system_reset_request(); + return; + } + if (banks[1] & MCI_STATUS_VAL) + status |= MCI_STATUS_OVER; + banks[2] = addr; + banks[3] = misc; + cenv->mcg_status = mcg_status; + banks[1] = status; + cpu_interrupt(cenv, CPU_INTERRUPT_MCE); + } else if (!(banks[1] & MCI_STATUS_VAL) + || !(banks[1] & MCI_STATUS_UC)) { + if (banks[1] & MCI_STATUS_VAL) + status |= MCI_STATUS_OVER; + banks[2] = addr; + banks[3] = misc; + banks[1] = status; + } else + banks[1] |= MCI_STATUS_OVER; +} +#endif /* !VBOX */ +#endif /* !CONFIG_USER_ONLY */ + +#ifndef VBOX + +static void mce_init(CPUX86State *cenv) +{ + unsigned int bank, bank_num; + + if (((cenv->cpuid_version >> 8)&0xf) >= 6 + && (cenv->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)) { + cenv->mcg_cap = MCE_CAP_DEF | MCE_BANKS_DEF; + cenv->mcg_ctl = ~(uint64_t)0; + bank_num = MCE_BANKS_DEF; + for (bank = 0; bank < bank_num; bank++) + cenv->mce_banks[bank*4] = ~(uint64_t)0; + } +} + +int cpu_x86_get_descr_debug(CPUX86State *env, unsigned int selector, + target_ulong *base, unsigned int *limit, + unsigned int *flags) +{ + SegmentCache *dt; + target_ulong ptr; + uint32_t e1, e2; + int index; + + if (selector & 0x4) + dt = &env->ldt; + else + dt = &env->gdt; + index = selector & ~7; + ptr = dt->base + index; + if ((index + 7) > dt->limit + || cpu_memory_rw_debug(env, ptr, (uint8_t *)&e1, sizeof(e1), 0) != 0 + || cpu_memory_rw_debug(env, ptr+4, (uint8_t *)&e2, sizeof(e2), 0) != 0) + return 0; + + *base = ((e1 >> 16) | ((e2 & 0xff) << 16) | (e2 & 0xff000000)); + *limit = (e1 & 0xffff) | (e2 & 0x000f0000); + if (e2 & DESC_G_MASK) + *limit = (*limit << 12) | 0xfff; + *flags = e2; + + return 1; +} + +#endif /* !VBOX */ + +#ifndef VBOX +CPUX86State *cpu_x86_init(const char *cpu_model) +#else +CPUX86State *cpu_x86_init(CPUX86State *env, const char *cpu_model) +#endif +{ +#ifndef VBOX + CPUX86State *env; +#endif + static int inited; + +#ifndef VBOX + env = qemu_mallocz(sizeof(CPUX86State)); +#endif + cpu_exec_init(env); + env->cpu_model_str = cpu_model; + + /* init various static tables */ + if (!inited) { + inited = 1; + optimize_flags_init(); +#ifndef CONFIG_USER_ONLY + prev_debug_excp_handler = + cpu_set_debug_excp_handler(breakpoint_handler); +#endif + } +#ifndef VBOX + if (cpu_x86_register(env, cpu_model) < 0) { + cpu_x86_close(env); + return NULL; + } + mce_init(env); +#endif + + qemu_init_vcpu(env); + + return env; +} + +#ifndef VBOX +#if !defined(CONFIG_USER_ONLY) +void do_cpu_init(CPUState *env) +{ + int sipi = env->interrupt_request & CPU_INTERRUPT_SIPI; + 
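/* An INIT must not lose a latched SIPI: save the SIPI bit, reset the CPU and the local APIC, then restore it so the BSP can still start this AP. */ +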
cpu_reset(env); + env->interrupt_request = sipi; + apic_init_reset(env->apic_state); + env->halted = !cpu_is_bsp(env); +} + +void do_cpu_sipi(CPUState *env) +{ + apic_sipi(env->apic_state); +} +#else +void do_cpu_init(CPUState *env) +{ +} +void do_cpu_sipi(CPUState *env) +{ +} +#endif +#endif /* !VBOX */ diff --git a/src/recompiler/target-i386/helper.h b/src/recompiler/target-i386/helper.h new file mode 100644 index 00000000..8307304a --- /dev/null +++ b/src/recompiler/target-i386/helper.h @@ -0,0 +1,253 @@ +#include "def-helper.h" + +DEF_HELPER_FLAGS_1(cc_compute_all, TCG_CALL_PURE, i32, int) +DEF_HELPER_FLAGS_1(cc_compute_c, TCG_CALL_PURE, i32, int) + +DEF_HELPER_0(lock, void) +DEF_HELPER_0(unlock, void) +DEF_HELPER_2(write_eflags, void, tl, i32) +DEF_HELPER_0(read_eflags, tl) +DEF_HELPER_1(divb_AL, void, tl) +DEF_HELPER_1(idivb_AL, void, tl) +DEF_HELPER_1(divw_AX, void, tl) +DEF_HELPER_1(idivw_AX, void, tl) +DEF_HELPER_1(divl_EAX, void, tl) +DEF_HELPER_1(idivl_EAX, void, tl) +#ifdef TARGET_X86_64 +DEF_HELPER_1(mulq_EAX_T0, void, tl) +DEF_HELPER_1(imulq_EAX_T0, void, tl) +DEF_HELPER_2(imulq_T0_T1, tl, tl, tl) +DEF_HELPER_1(divq_EAX, void, tl) +DEF_HELPER_1(idivq_EAX, void, tl) +#endif + +DEF_HELPER_1(aam, void, int) +DEF_HELPER_1(aad, void, int) +DEF_HELPER_0(aaa, void) +DEF_HELPER_0(aas, void) +DEF_HELPER_0(daa, void) +DEF_HELPER_0(das, void) + +DEF_HELPER_1(lsl, tl, tl) +DEF_HELPER_1(lar, tl, tl) +DEF_HELPER_1(verr, void, tl) +DEF_HELPER_1(verw, void, tl) +DEF_HELPER_1(lldt, void, int) +DEF_HELPER_1(ltr, void, int) +DEF_HELPER_2(load_seg, void, int, int) +DEF_HELPER_3(ljmp_protected, void, int, tl, int) +DEF_HELPER_4(lcall_real, void, int, tl, int, int) +DEF_HELPER_4(lcall_protected, void, int, tl, int, int) +DEF_HELPER_1(iret_real, void, int) +DEF_HELPER_2(iret_protected, void, int, int) +DEF_HELPER_2(lret_protected, void, int, int) +DEF_HELPER_1(read_crN, tl, int) +DEF_HELPER_2(write_crN, void, int, tl) +DEF_HELPER_1(lmsw, void, tl) +DEF_HELPER_0(clts, void) +DEF_HELPER_2(movl_drN_T0, void, int, tl) +DEF_HELPER_1(invlpg, void, tl) + +DEF_HELPER_3(enter_level, void, int, int, tl) +#ifdef TARGET_X86_64 +DEF_HELPER_3(enter64_level, void, int, int, tl) +#endif +DEF_HELPER_0(sysenter, void) +DEF_HELPER_1(sysexit, void, int) +#ifdef TARGET_X86_64 +DEF_HELPER_1(syscall, void, int) +DEF_HELPER_1(sysret, void, int) +#endif +DEF_HELPER_1(hlt, void, int) +DEF_HELPER_1(monitor, void, tl) +DEF_HELPER_1(mwait, void, int) +DEF_HELPER_0(debug, void) +DEF_HELPER_0(reset_rf, void) +DEF_HELPER_2(raise_interrupt, void, int, int) +DEF_HELPER_1(raise_exception, void, int) +DEF_HELPER_0(cli, void) +DEF_HELPER_0(sti, void) +DEF_HELPER_0(set_inhibit_irq, void) +DEF_HELPER_0(reset_inhibit_irq, void) +DEF_HELPER_2(boundw, void, tl, int) +DEF_HELPER_2(boundl, void, tl, int) +DEF_HELPER_0(rsm, void) +DEF_HELPER_1(into, void, int) +DEF_HELPER_1(cmpxchg8b, void, tl) +#ifdef TARGET_X86_64 +DEF_HELPER_1(cmpxchg16b, void, tl) +#endif +DEF_HELPER_0(single_step, void) +DEF_HELPER_0(cpuid, void) +DEF_HELPER_0(rdtsc, void) +DEF_HELPER_0(rdtscp, void) +DEF_HELPER_0(rdpmc, void) +DEF_HELPER_0(rdmsr, void) +DEF_HELPER_0(wrmsr, void) + +DEF_HELPER_1(check_iob, void, i32) +DEF_HELPER_1(check_iow, void, i32) +DEF_HELPER_1(check_iol, void, i32) +DEF_HELPER_2(outb, void, i32, i32) +DEF_HELPER_1(inb, tl, i32) +DEF_HELPER_2(outw, void, i32, i32) +DEF_HELPER_1(inw, tl, i32) +DEF_HELPER_2(outl, void, i32, i32) +DEF_HELPER_1(inl, tl, i32) + +DEF_HELPER_2(svm_check_intercept_param, void, i32, i64) +DEF_HELPER_2(vmexit, void, i32, i64) 
+DEF_HELPER_3(svm_check_io, void, i32, i32, i32) +DEF_HELPER_2(vmrun, void, int, int) +DEF_HELPER_0(vmmcall, void) +DEF_HELPER_1(vmload, void, int) +DEF_HELPER_1(vmsave, void, int) +DEF_HELPER_0(stgi, void) +DEF_HELPER_0(clgi, void) +DEF_HELPER_0(skinit, void) +DEF_HELPER_1(invlpga, void, int) + +/* x86 FPU */ + +DEF_HELPER_1(flds_FT0, void, i32) +DEF_HELPER_1(fldl_FT0, void, i64) +DEF_HELPER_1(fildl_FT0, void, s32) +DEF_HELPER_1(flds_ST0, void, i32) +DEF_HELPER_1(fldl_ST0, void, i64) +DEF_HELPER_1(fildl_ST0, void, s32) +DEF_HELPER_1(fildll_ST0, void, s64) +#ifndef VBOX +DEF_HELPER_0(fsts_ST0, i32) +DEF_HELPER_0(fstl_ST0, i64) +DEF_HELPER_0(fist_ST0, s32) +DEF_HELPER_0(fistl_ST0, s32) +DEF_HELPER_0(fistll_ST0, s64) +DEF_HELPER_0(fistt_ST0, s32) +DEF_HELPER_0(fisttl_ST0, s32) +DEF_HELPER_0(fisttll_ST0, s64) +#else /* VBOX */ +DEF_HELPER_0(fsts_ST0, RTCCUINTREG) +DEF_HELPER_0(fstl_ST0, i64) +DEF_HELPER_0(fist_ST0, RTCCINTREG) +DEF_HELPER_0(fistl_ST0, RTCCINTREG) +DEF_HELPER_0(fistll_ST0, s64) +DEF_HELPER_0(fistt_ST0, RTCCINTREG) +DEF_HELPER_0(fisttl_ST0, RTCCINTREG) +DEF_HELPER_0(fisttll_ST0, s64) +#endif /* VBOX */ +DEF_HELPER_1(fldt_ST0, void, tl) +DEF_HELPER_1(fstt_ST0, void, tl) +DEF_HELPER_0(fpush, void) +DEF_HELPER_0(fpop, void) +DEF_HELPER_0(fdecstp, void) +DEF_HELPER_0(fincstp, void) +DEF_HELPER_1(ffree_STN, void, int) +DEF_HELPER_0(fmov_ST0_FT0, void) +DEF_HELPER_1(fmov_FT0_STN, void, int) +DEF_HELPER_1(fmov_ST0_STN, void, int) +DEF_HELPER_1(fmov_STN_ST0, void, int) +DEF_HELPER_1(fxchg_ST0_STN, void, int) +DEF_HELPER_0(fcom_ST0_FT0, void) +DEF_HELPER_0(fucom_ST0_FT0, void) +DEF_HELPER_0(fcomi_ST0_FT0, void) +DEF_HELPER_0(fucomi_ST0_FT0, void) +DEF_HELPER_0(fadd_ST0_FT0, void) +DEF_HELPER_0(fmul_ST0_FT0, void) +DEF_HELPER_0(fsub_ST0_FT0, void) +DEF_HELPER_0(fsubr_ST0_FT0, void) +DEF_HELPER_0(fdiv_ST0_FT0, void) +DEF_HELPER_0(fdivr_ST0_FT0, void) +DEF_HELPER_1(fadd_STN_ST0, void, int) +DEF_HELPER_1(fmul_STN_ST0, void, int) +DEF_HELPER_1(fsub_STN_ST0, void, int) +DEF_HELPER_1(fsubr_STN_ST0, void, int) +DEF_HELPER_1(fdiv_STN_ST0, void, int) +DEF_HELPER_1(fdivr_STN_ST0, void, int) +DEF_HELPER_0(fchs_ST0, void) +DEF_HELPER_0(fabs_ST0, void) +DEF_HELPER_0(fxam_ST0, void) +DEF_HELPER_0(fld1_ST0, void) +DEF_HELPER_0(fldl2t_ST0, void) +DEF_HELPER_0(fldl2e_ST0, void) +DEF_HELPER_0(fldpi_ST0, void) +DEF_HELPER_0(fldlg2_ST0, void) +DEF_HELPER_0(fldln2_ST0, void) +DEF_HELPER_0(fldz_ST0, void) +DEF_HELPER_0(fldz_FT0, void) +#ifndef VBOX +DEF_HELPER_0(fnstsw, i32) +DEF_HELPER_0(fnstcw, i32) +#else /* VBOX */ +DEF_HELPER_0(fnstsw, RTCCUINTREG) +DEF_HELPER_0(fnstcw, RTCCUINTREG) +#endif /* VBOX */ +DEF_HELPER_1(fldcw, void, i32) +DEF_HELPER_0(fclex, void) +DEF_HELPER_0(fwait, void) +DEF_HELPER_0(fninit, void) +DEF_HELPER_1(fbld_ST0, void, tl) +DEF_HELPER_1(fbst_ST0, void, tl) +DEF_HELPER_0(f2xm1, void) +DEF_HELPER_0(fyl2x, void) +DEF_HELPER_0(fptan, void) +DEF_HELPER_0(fpatan, void) +DEF_HELPER_0(fxtract, void) +DEF_HELPER_0(fprem1, void) +DEF_HELPER_0(fprem, void) +DEF_HELPER_0(fyl2xp1, void) +DEF_HELPER_0(fsqrt, void) +DEF_HELPER_0(fsincos, void) +DEF_HELPER_0(frndint, void) +DEF_HELPER_0(fscale, void) +DEF_HELPER_0(fsin, void) +DEF_HELPER_0(fcos, void) +DEF_HELPER_2(fstenv, void, tl, int) +DEF_HELPER_2(fldenv, void, tl, int) +DEF_HELPER_2(fsave, void, tl, int) +DEF_HELPER_2(frstor, void, tl, int) +DEF_HELPER_2(fxsave, void, tl, int) +DEF_HELPER_2(fxrstor, void, tl, int) +DEF_HELPER_1(bsf, tl, tl) +DEF_HELPER_1(bsr, tl, tl) +DEF_HELPER_2(lzcnt, tl, tl, int) + +/* MMX/SSE */ + 
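+/* Most MMX/SSE helper prototypes are generated by including ops_sse_header.h twice below: SHIFT 0 declares the 64-bit MMX variants, SHIFT 1 the 128-bit SSE variants. */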
+DEF_HELPER_0(enter_mmx, void) +DEF_HELPER_0(emms, void) +DEF_HELPER_2(movq, void, ptr, ptr) + +#define SHIFT 0 +#include "ops_sse_header.h" +#define SHIFT 1 +#include "ops_sse_header.h" + +DEF_HELPER_2(rclb, tl, tl, tl) +DEF_HELPER_2(rclw, tl, tl, tl) +DEF_HELPER_2(rcll, tl, tl, tl) +DEF_HELPER_2(rcrb, tl, tl, tl) +DEF_HELPER_2(rcrw, tl, tl, tl) +DEF_HELPER_2(rcrl, tl, tl, tl) +#ifdef TARGET_X86_64 +DEF_HELPER_2(rclq, tl, tl, tl) +DEF_HELPER_2(rcrq, tl, tl, tl) +#endif + +#ifdef VBOX +DEF_HELPER_1(write_eflags_vme, void, tl) +DEF_HELPER_0(read_eflags_vme, tl) +DEF_HELPER_0(cli_vme, void) +DEF_HELPER_0(sti_vme, void) +DEF_HELPER_0(check_external_event, void) +DEF_HELPER_0(dump_state, void) +DEF_HELPER_1(sync_seg, void, i32) + +void helper_external_event(void); +void helper_record_call(void); + +/* in op_helper.c */ +void sync_seg(CPUX86State *env1, int seg_reg, int selector); +void sync_ldtr(CPUX86State *env1, int selector); +#endif /* VBOX */ + +#include "def-helper.h" diff --git a/src/recompiler/target-i386/helper_template.h b/src/recompiler/target-i386/helper_template.h new file mode 100644 index 00000000..193b3274 --- /dev/null +++ b/src/recompiler/target-i386/helper_template.h @@ -0,0 +1,344 @@ +/* + * i386 helpers + * + * Copyright (c) 2008 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. 
+ */ + +#define DATA_BITS (1 << (3 + SHIFT)) +#define SHIFT_MASK (DATA_BITS - 1) +#define SIGN_MASK (((target_ulong)1) << (DATA_BITS - 1)) +#if DATA_BITS <= 32 +#define SHIFT1_MASK 0x1f +#else +#define SHIFT1_MASK 0x3f +#endif + +#if DATA_BITS == 8 +#define SUFFIX b +#define DATA_TYPE uint8_t +#define DATA_STYPE int8_t +#define DATA_MASK 0xff +#elif DATA_BITS == 16 +#define SUFFIX w +#define DATA_TYPE uint16_t +#define DATA_STYPE int16_t +#define DATA_MASK 0xffff +#elif DATA_BITS == 32 +#define SUFFIX l +#define DATA_TYPE uint32_t +#define DATA_STYPE int32_t +#define DATA_MASK 0xffffffff +#elif DATA_BITS == 64 +#define SUFFIX q +#define DATA_TYPE uint64_t +#define DATA_STYPE int64_t +#define DATA_MASK 0xffffffffffffffffULL +#else +#error unhandled operand size +#endif + +/* dynamic flags computation */ + +static int glue(compute_all_add, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + target_long src1, src2; + src1 = CC_SRC; + src2 = CC_DST - CC_SRC; + cf = (DATA_TYPE)CC_DST < (DATA_TYPE)src1; + pf = parity_table[(uint8_t)CC_DST]; + af = (CC_DST ^ src1 ^ src2) & 0x10; + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = lshift((src1 ^ src2 ^ -1) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O; + return cf | pf | af | zf | sf | of; +} + +static int glue(compute_c_add, SUFFIX)(void) +{ + int cf; + target_long src1; + src1 = CC_SRC; + cf = (DATA_TYPE)CC_DST < (DATA_TYPE)src1; + return cf; +} + +static int glue(compute_all_adc, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + target_long src1, src2; + src1 = CC_SRC; + src2 = CC_DST - CC_SRC - 1; + cf = (DATA_TYPE)CC_DST <= (DATA_TYPE)src1; + pf = parity_table[(uint8_t)CC_DST]; + af = (CC_DST ^ src1 ^ src2) & 0x10; + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = lshift((src1 ^ src2 ^ -1) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O; + return cf | pf | af | zf | sf | of; +} + +static int glue(compute_c_adc, SUFFIX)(void) +{ + int cf; + target_long src1; + src1 = CC_SRC; + cf = (DATA_TYPE)CC_DST <= (DATA_TYPE)src1; + return cf; +} + +static int glue(compute_all_sub, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + target_long src1, src2; + src1 = CC_DST + CC_SRC; + src2 = CC_SRC; + cf = (DATA_TYPE)src1 < (DATA_TYPE)src2; + pf = parity_table[(uint8_t)CC_DST]; + af = (CC_DST ^ src1 ^ src2) & 0x10; + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = lshift((src1 ^ src2) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O; + return cf | pf | af | zf | sf | of; +} + +static int glue(compute_c_sub, SUFFIX)(void) +{ + int cf; + target_long src1, src2; + src1 = CC_DST + CC_SRC; + src2 = CC_SRC; + cf = (DATA_TYPE)src1 < (DATA_TYPE)src2; + return cf; +} + +static int glue(compute_all_sbb, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + target_long src1, src2; + src1 = CC_DST + CC_SRC + 1; + src2 = CC_SRC; + cf = (DATA_TYPE)src1 <= (DATA_TYPE)src2; + pf = parity_table[(uint8_t)CC_DST]; + af = (CC_DST ^ src1 ^ src2) & 0x10; + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = lshift((src1 ^ src2) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O; + return cf | pf | af | zf | sf | of; +} + +static int glue(compute_c_sbb, SUFFIX)(void) +{ + int cf; + target_long src1, src2; + src1 = CC_DST + CC_SRC + 1; + src2 = CC_SRC; + cf = (DATA_TYPE)src1 <= (DATA_TYPE)src2; + return cf; +} + +static int glue(compute_all_logic, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + cf = 0; + pf = parity_table[(uint8_t)CC_DST]; + af = 0; + zf = 
((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = 0; + return cf | pf | af | zf | sf | of; +} + +static int glue(compute_c_logic, SUFFIX)(void) +{ + return 0; +} + +static int glue(compute_all_inc, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + target_long src1, src2; + src1 = CC_DST - 1; + src2 = 1; + cf = CC_SRC; + pf = parity_table[(uint8_t)CC_DST]; + af = (CC_DST ^ src1 ^ src2) & 0x10; + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = ((CC_DST & DATA_MASK) == SIGN_MASK) << 11; + return cf | pf | af | zf | sf | of; +} + +#if DATA_BITS == 32 +static int glue(compute_c_inc, SUFFIX)(void) +{ + return CC_SRC; +} +#endif + +static int glue(compute_all_dec, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + target_long src1, src2; + src1 = CC_DST + 1; + src2 = 1; + cf = CC_SRC; + pf = parity_table[(uint8_t)CC_DST]; + af = (CC_DST ^ src1 ^ src2) & 0x10; + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = ((CC_DST & DATA_MASK) == ((target_ulong)SIGN_MASK - 1)) << 11; + return cf | pf | af | zf | sf | of; +} + +static int glue(compute_all_shl, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + cf = (CC_SRC >> (DATA_BITS - 1)) & CC_C; + pf = parity_table[(uint8_t)CC_DST]; + af = 0; /* undefined */ + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + /* of is defined if shift count == 1 */ + of = lshift(CC_SRC ^ CC_DST, 12 - DATA_BITS) & CC_O; + return cf | pf | af | zf | sf | of; +} + +static int glue(compute_c_shl, SUFFIX)(void) +{ + return (CC_SRC >> (DATA_BITS - 1)) & CC_C; +} + +#if DATA_BITS == 32 +static int glue(compute_c_sar, SUFFIX)(void) +{ + return CC_SRC & 1; +} +#endif + +static int glue(compute_all_sar, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + cf = CC_SRC & 1; + pf = parity_table[(uint8_t)CC_DST]; + af = 0; /* undefined */ + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + /* of is defined if shift count == 1 */ + of = lshift(CC_SRC ^ CC_DST, 12 - DATA_BITS) & CC_O; + return cf | pf | af | zf | sf | of; +} + +#if DATA_BITS == 32 +static int glue(compute_c_mul, SUFFIX)(void) +{ + int cf; + cf = (CC_SRC != 0); + return cf; +} +#endif + +/* NOTE: we compute the flags like the P4. On olders CPUs, only OF and + CF are modified and it is slower to do that. 
*/ +static int glue(compute_all_mul, SUFFIX)(void) +{ + int cf, pf, af, zf, sf, of; + cf = (CC_SRC != 0); + pf = parity_table[(uint8_t)CC_DST]; + af = 0; /* undefined */ + zf = ((DATA_TYPE)CC_DST == 0) << 6; + sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; + of = cf << 11; + return cf | pf | af | zf | sf | of; +} + +/* shifts */ + +target_ulong glue(helper_rcl, SUFFIX)(target_ulong t0, target_ulong t1) +{ + int count, eflags; + target_ulong src; + target_long res; + + count = t1 & SHIFT1_MASK; +#if DATA_BITS == 16 + count = rclw_table[count]; +#elif DATA_BITS == 8 + count = rclb_table[count]; +#endif + if (count) { + eflags = helper_cc_compute_all(CC_OP); + t0 &= DATA_MASK; + src = t0; + res = (t0 << count) | ((target_ulong)(eflags & CC_C) << (count - 1)); + if (count > 1) + res |= t0 >> (DATA_BITS + 1 - count); + t0 = res; + env->cc_tmp = (eflags & ~(CC_C | CC_O)) | + (lshift(src ^ t0, 11 - (DATA_BITS - 1)) & CC_O) | + ((src >> (DATA_BITS - count)) & CC_C); + } else { + env->cc_tmp = -1; + } + return t0; +} + +target_ulong glue(helper_rcr, SUFFIX)(target_ulong t0, target_ulong t1) +{ + int count, eflags; + target_ulong src; + target_long res; + + count = t1 & SHIFT1_MASK; +#if DATA_BITS == 16 + count = rclw_table[count]; +#elif DATA_BITS == 8 + count = rclb_table[count]; +#endif + if (count) { + eflags = helper_cc_compute_all(CC_OP); + t0 &= DATA_MASK; + src = t0; + res = (t0 >> count) | ((target_ulong)(eflags & CC_C) << (DATA_BITS - count)); + if (count > 1) + res |= t0 << (DATA_BITS + 1 - count); + t0 = res; + env->cc_tmp = (eflags & ~(CC_C | CC_O)) | + (lshift(src ^ t0, 11 - (DATA_BITS - 1)) & CC_O) | + ((src >> (count - 1)) & CC_C); + } else { + env->cc_tmp = -1; + } + return t0; +} + +#undef DATA_BITS +#undef SHIFT_MASK +#undef SHIFT1_MASK +#undef SIGN_MASK +#undef DATA_TYPE +#undef DATA_STYPE +#undef DATA_MASK +#undef SUFFIX diff --git a/src/recompiler/target-i386/op_helper.c b/src/recompiler/target-i386/op_helper.c new file mode 100644 index 00000000..07b58f8d --- /dev/null +++ b/src/recompiler/target-i386/op_helper.c @@ -0,0 +1,7164 @@ +/* + * i386 helpers + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. 
+ */ + +#include "exec.h" +#include "exec-all.h" +#include "host-utils.h" +#include "ioport.h" + +#ifdef VBOX +# include "qemu-common.h" +# include <math.h> +# include "tcg.h" +# include <VBox/err.h> +#endif /* VBOX */ + +//#define DEBUG_PCALL + + +#ifdef DEBUG_PCALL +# define LOG_PCALL(...) qemu_log_mask(CPU_LOG_PCALL, ## __VA_ARGS__) +# define LOG_PCALL_STATE(env) \ + log_cpu_state_mask(CPU_LOG_PCALL, (env), X86_DUMP_CCOP) +#else +# define LOG_PCALL(...) do { } while (0) +# define LOG_PCALL_STATE(env) do { } while (0) +#endif + + +#if 0 +#define raise_exception_err(a, b)\ +do {\ + qemu_log("raise_exception line=%d\n", __LINE__);\ + (raise_exception_err)(a, b);\ +} while (0) +#endif + +static const uint8_t parity_table[256] = { + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0, + 0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P, +}; + +/* modulo 17 table */ +static const uint8_t rclw_table[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9,10,11,12,13,14,15, + 16, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9,10,11,12,13,14, +}; + +/* modulo 9 table */ +static const uint8_t rclb_table[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 0, 1, 2, 3, 4, +}; + +static const CPU86_LDouble f15rk[7] = +{ + 0.00000000000000000000L, + 1.00000000000000000000L, + 3.14159265358979323851L, /*pi*/ + 0.30102999566398119523L, /*lg2*/ + 0.69314718055994530943L, /*ln2*/ + 1.44269504088896340739L, /*l2e*/ + 3.32192809488736234781L, /*l2t*/ +}; + +/* broken thread support */ + +static spinlock_t global_cpu_lock = SPIN_LOCK_UNLOCKED; + +void helper_lock(void) +{ + spin_lock(&global_cpu_lock); +} + +void helper_unlock(void) +{ + spin_unlock(&global_cpu_lock); +} + +void helper_write_eflags(target_ulong t0, uint32_t update_mask) +{ + load_eflags(t0, update_mask); +} + +target_ulong helper_read_eflags(void) +{ + uint32_t eflags; + eflags = helper_cc_compute_all(CC_OP); + eflags |= (DF & DF_MASK); + eflags |= env->eflags & ~(VM_MASK | RF_MASK); + return eflags; +} + +#ifdef VBOX + +void helper_write_eflags_vme(target_ulong t0) +{ + unsigned int new_eflags = t0; + + assert(env->eflags & (1<<VM_SHIFT)); + + /* if virtual interrupt pending and (virtual) interrupts will be enabled -> #GP */ + /* if TF will be set -> #GP */ + if ( ((new_eflags & IF_MASK) && (env->eflags & VIP_MASK)) + || (new_eflags & TF_MASK)) { + raise_exception(EXCP0D_GPF); + } else { + load_eflags(new_eflags, + (TF_MASK | AC_MASK | 
ID_MASK | NT_MASK) & 0xffff); + + if (new_eflags & IF_MASK) { + env->eflags |= VIF_MASK; + } else { + env->eflags &= ~VIF_MASK; + } + } +} + +target_ulong helper_read_eflags_vme(void) +{ + uint32_t eflags; + eflags = helper_cc_compute_all(CC_OP); + eflags |= (DF & DF_MASK); + eflags |= env->eflags & ~(VM_MASK | RF_MASK); + if (env->eflags & VIF_MASK) + eflags |= IF_MASK; + else + eflags &= ~IF_MASK; + + /* According to AMD manual, should be read with IOPL == 3 */ + eflags |= (3 << IOPL_SHIFT); + + /* We only use helper_read_eflags_vme() in 16-bits mode */ + return eflags & 0xffff; +} + +void helper_dump_state() +{ + LogRel(("CS:EIP=%08x:%08x, FLAGS=%08x\n", env->segs[R_CS].base, env->eip, env->eflags)); + LogRel(("EAX=%08x\tECX=%08x\tEDX=%08x\tEBX=%08x\n", + (uint32_t)env->regs[R_EAX], (uint32_t)env->regs[R_ECX], + (uint32_t)env->regs[R_EDX], (uint32_t)env->regs[R_EBX])); + LogRel(("ESP=%08x\tEBP=%08x\tESI=%08x\tEDI=%08x\n", + (uint32_t)env->regs[R_ESP], (uint32_t)env->regs[R_EBP], + (uint32_t)env->regs[R_ESI], (uint32_t)env->regs[R_EDI])); +} + +/** + * Updates e2 with the DESC_A_MASK, writes it to the descriptor table, and + * returns the updated e2. + * + * @returns e2 with A set. + * @param e2 The 2nd selector DWORD. + */ +static uint32_t set_segment_accessed(int selector, uint32_t e2) +{ + SegmentCache *dt = selector & X86_SEL_LDT ? &env->ldt : &env->gdt; + target_ulong ptr = dt->base + (selector & X86_SEL_MASK); + + e2 |= DESC_A_MASK; + stl_kernel(ptr + 4, e2); + return e2; +} + +#endif /* VBOX */ + +/* return non zero if error */ +static inline int load_segment(uint32_t *e1_ptr, uint32_t *e2_ptr, + int selector) +{ + SegmentCache *dt; + int index; + target_ulong ptr; + + if (selector & 0x4) + dt = &env->ldt; + else + dt = &env->gdt; + index = selector & ~7; + if ((index + 7) > dt->limit) + return -1; + ptr = dt->base + index; + *e1_ptr = ldl_kernel(ptr); + *e2_ptr = ldl_kernel(ptr + 4); + return 0; +} + +static inline unsigned int get_seg_limit(uint32_t e1, uint32_t e2) +{ + unsigned int limit; + limit = (e1 & 0xffff) | (e2 & 0x000f0000); + if (e2 & DESC_G_MASK) + limit = (limit << 12) | 0xfff; + return limit; +} + +static inline uint32_t get_seg_base(uint32_t e1, uint32_t e2) +{ + return ((e1 >> 16) | ((e2 & 0xff) << 16) | (e2 & 0xff000000)); +} + +static inline void load_seg_cache_raw_dt(SegmentCache *sc, uint32_t e1, uint32_t e2) +{ + sc->base = get_seg_base(e1, e2); + sc->limit = get_seg_limit(e1, e2); +#ifndef VBOX + sc->flags = e2; +#else + sc->flags = e2 & DESC_RAW_FLAG_BITS; + sc->newselector = 0; + sc->fVBoxFlags = CPUMSELREG_FLAGS_VALID; +#endif +} + +/* init the segment cache in vm86 mode. */ +static inline void load_seg_vm(int seg, int selector) +{ + selector &= 0xffff; +#ifdef VBOX + /* flags must be 0xf3; expand-up read/write accessed data segment with DPL=3. 
(VT-x) */ + unsigned flags = DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | DESC_A_MASK; + flags |= (3 << DESC_DPL_SHIFT); + + cpu_x86_load_seg_cache(env, seg, selector, + (selector << 4), 0xffff, flags); +#else /* VBOX */ + cpu_x86_load_seg_cache(env, seg, selector, + (selector << 4), 0xffff, 0); +#endif /* VBOX */ +} + +static inline void get_ss_esp_from_tss(uint32_t *ss_ptr, + uint32_t *esp_ptr, int dpl) +{ +#ifndef VBOX + int type, index, shift; +#else + unsigned int type, index, shift; +#endif + +#if 0 + { + int i; + printf("TR: base=%p limit=%x\n", env->tr.base, env->tr.limit); + for(i=0;i<env->tr.limit;i++) { + printf("%02x ", env->tr.base[i]); + if ((i & 7) == 7) printf("\n"); + } + printf("\n"); + } +#endif + + if (!(env->tr.flags & DESC_P_MASK)) + cpu_abort(env, "invalid tss"); + type = (env->tr.flags >> DESC_TYPE_SHIFT) & 0xf; + if ((type & 7) != 3) + cpu_abort(env, "invalid tss type"); + shift = type >> 3; + index = (dpl * 4 + 2) << shift; + if (index + (4 << shift) - 1 > env->tr.limit) + raise_exception_err(EXCP0A_TSS, env->tr.selector & 0xfffc); + if (shift == 0) { + *esp_ptr = lduw_kernel(env->tr.base + index); + *ss_ptr = lduw_kernel(env->tr.base + index + 2); + } else { + *esp_ptr = ldl_kernel(env->tr.base + index); + *ss_ptr = lduw_kernel(env->tr.base + index + 4); + } +} + +/* XXX: merge with load_seg() */ +static void tss_load_seg(int seg_reg, int selector) +{ + uint32_t e1, e2; + int rpl, dpl, cpl; + +#ifdef VBOX + e1 = e2 = 0; /* gcc warning? */ + cpl = env->hflags & HF_CPL_MASK; + /* Trying to load a selector with CPL=1? */ + if (cpl == 0 && (selector & 3) == 1 && (env->state & CPU_RAW_RING0)) + { + Log(("RPL 1 -> sel %04X -> %04X (tss_load_seg)\n", selector, selector & 0xfffc)); + selector = selector & 0xfffc; + } +#endif /* VBOX */ + + if ((selector & 0xfffc) != 0) { + if (load_segment(&e1, &e2, selector) != 0) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + if (!(e2 & DESC_S_MASK)) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + rpl = selector & 3; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + if (seg_reg == R_CS) { + if (!(e2 & DESC_CS_MASK)) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + /* XXX: is it correct ? 
*/ + if (dpl != rpl) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + if ((e2 & DESC_C_MASK) && dpl > rpl) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + } else if (seg_reg == R_SS) { + /* SS must be writable data */ + if ((e2 & DESC_CS_MASK) || !(e2 & DESC_W_MASK)) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + if (dpl != cpl || dpl != rpl) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + } else { + /* not readable code */ + if ((e2 & DESC_CS_MASK) && !(e2 & DESC_R_MASK)) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + /* if data or non conforming code, checks the rights */ + if (((e2 >> DESC_TYPE_SHIFT) & 0xf) < 12) { + if (dpl < cpl || dpl < rpl) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); + } + } + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, selector & 0xfffc); + cpu_x86_load_seg_cache(env, seg_reg, selector, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); + } else { + if (seg_reg == R_SS || seg_reg == R_CS) + raise_exception_err(EXCP0A_TSS, selector & 0xfffc); +#ifdef VBOX +# if 0 /** @todo now we ignore loading 0 selectors, need to check what is correct once */ + cpu_x86_load_seg_cache(env, seg_reg, selector, + 0, 0, 0); +# endif +#endif /* VBOX */ + } +} + +#define SWITCH_TSS_JMP 0 +#define SWITCH_TSS_IRET 1 +#define SWITCH_TSS_CALL 2 + +/* XXX: restore CPU state in registers (PowerPC case) */ +static void switch_tss(int tss_selector, + uint32_t e1, uint32_t e2, int source, + uint32_t next_eip) +{ + int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, v1, v2, i; + target_ulong tss_base; + uint32_t new_regs[8], new_segs[6]; + uint32_t new_eflags, new_eip, new_cr3, new_ldt, new_trap; + uint32_t old_eflags, eflags_mask; + SegmentCache *dt; +#ifndef VBOX + int index; +#else + unsigned int index; +#endif + target_ulong ptr; + + type = (e2 >> DESC_TYPE_SHIFT) & 0xf; + LOG_PCALL("switch_tss: sel=0x%04x type=%d src=%d\n", tss_selector, type, source); + + /* if task gate, we read the TSS segment and we load it */ + if (type == 5) { + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, tss_selector & 0xfffc); + tss_selector = e1 >> 16; + if (tss_selector & 4) + raise_exception_err(EXCP0A_TSS, tss_selector & 0xfffc); + if (load_segment(&e1, &e2, tss_selector) != 0) + raise_exception_err(EXCP0D_GPF, tss_selector & 0xfffc); + if (e2 & DESC_S_MASK) + raise_exception_err(EXCP0D_GPF, tss_selector & 0xfffc); + type = (e2 >> DESC_TYPE_SHIFT) & 0xf; + if ((type & 7) != 1) + raise_exception_err(EXCP0D_GPF, tss_selector & 0xfffc); + } + + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, tss_selector & 0xfffc); + + if (type & 8) + tss_limit_max = 103; + else + tss_limit_max = 43; + tss_limit = get_seg_limit(e1, e2); + tss_base = get_seg_base(e1, e2); + if ((tss_selector & 4) != 0 || + tss_limit < tss_limit_max) + raise_exception_err(EXCP0A_TSS, tss_selector & 0xfffc); + old_type = (env->tr.flags >> DESC_TYPE_SHIFT) & 0xf; + if (old_type & 8) + old_tss_limit_max = 103; + else + old_tss_limit_max = 43; + +#ifndef VBOX /* The old TSS is written first... 
*/ + /* read all the registers from the new TSS */ + if (type & 8) { + /* 32 bit */ + new_cr3 = ldl_kernel(tss_base + 0x1c); + new_eip = ldl_kernel(tss_base + 0x20); + new_eflags = ldl_kernel(tss_base + 0x24); + for(i = 0; i < 8; i++) + new_regs[i] = ldl_kernel(tss_base + (0x28 + i * 4)); + for(i = 0; i < 6; i++) + new_segs[i] = lduw_kernel(tss_base + (0x48 + i * 4)); + new_ldt = lduw_kernel(tss_base + 0x60); + new_trap = ldl_kernel(tss_base + 0x64); + } else { + /* 16 bit */ + new_cr3 = 0; + new_eip = lduw_kernel(tss_base + 0x0e); + new_eflags = lduw_kernel(tss_base + 0x10); + for(i = 0; i < 8; i++) + new_regs[i] = lduw_kernel(tss_base + (0x12 + i * 2)) | 0xffff0000; + for(i = 0; i < 4; i++) + new_segs[i] = lduw_kernel(tss_base + (0x22 + i * 4)); + new_ldt = lduw_kernel(tss_base + 0x2a); + new_segs[R_FS] = 0; + new_segs[R_GS] = 0; + new_trap = 0; + } +#endif + + /* NOTE: we must avoid memory exceptions during the task switch, + so we make dummy accesses before */ + /* XXX: it can still fail in some cases, so a bigger hack is + necessary to valid the TLB after having done the accesses */ + + v1 = ldub_kernel(env->tr.base); + v2 = ldub_kernel(env->tr.base + old_tss_limit_max); + stb_kernel(env->tr.base, v1); + stb_kernel(env->tr.base + old_tss_limit_max, v2); + + /* clear busy bit (it is restartable) */ + if (source == SWITCH_TSS_JMP || source == SWITCH_TSS_IRET) { + target_ulong ptr; + uint32_t e2; + ptr = env->gdt.base + (env->tr.selector & ~7); + e2 = ldl_kernel(ptr + 4); + e2 &= ~DESC_TSS_BUSY_MASK; + stl_kernel(ptr + 4, e2); + } + old_eflags = compute_eflags(); + if (source == SWITCH_TSS_IRET) + old_eflags &= ~NT_MASK; + + /* save the current state in the old TSS */ + if (type & 8) { + /* 32 bit */ + stl_kernel(env->tr.base + 0x20, next_eip); + stl_kernel(env->tr.base + 0x24, old_eflags); + stl_kernel(env->tr.base + (0x28 + 0 * 4), EAX); + stl_kernel(env->tr.base + (0x28 + 1 * 4), ECX); + stl_kernel(env->tr.base + (0x28 + 2 * 4), EDX); + stl_kernel(env->tr.base + (0x28 + 3 * 4), EBX); + stl_kernel(env->tr.base + (0x28 + 4 * 4), ESP); + stl_kernel(env->tr.base + (0x28 + 5 * 4), EBP); + stl_kernel(env->tr.base + (0x28 + 6 * 4), ESI); + stl_kernel(env->tr.base + (0x28 + 7 * 4), EDI); + for(i = 0; i < 6; i++) + stw_kernel(env->tr.base + (0x48 + i * 4), env->segs[i].selector); +#if defined(VBOX) && defined(DEBUG) + printf("TSS 32 bits switch\n"); + printf("Saving CS=%08X\n", env->segs[R_CS].selector); +#endif + } else { + /* 16 bit */ + stw_kernel(env->tr.base + 0x0e, next_eip); + stw_kernel(env->tr.base + 0x10, old_eflags); + stw_kernel(env->tr.base + (0x12 + 0 * 2), EAX); + stw_kernel(env->tr.base + (0x12 + 1 * 2), ECX); + stw_kernel(env->tr.base + (0x12 + 2 * 2), EDX); + stw_kernel(env->tr.base + (0x12 + 3 * 2), EBX); + stw_kernel(env->tr.base + (0x12 + 4 * 2), ESP); + stw_kernel(env->tr.base + (0x12 + 5 * 2), EBP); + stw_kernel(env->tr.base + (0x12 + 6 * 2), ESI); + stw_kernel(env->tr.base + (0x12 + 7 * 2), EDI); + for(i = 0; i < 4; i++) + stw_kernel(env->tr.base + (0x22 + i * 2), env->segs[i].selector); + } + +#ifdef VBOX + /* read all the registers from the new TSS - may be the same as the old one */ + if (type & 8) { + /* 32 bit */ + new_cr3 = ldl_kernel(tss_base + 0x1c); + new_eip = ldl_kernel(tss_base + 0x20); + new_eflags = ldl_kernel(tss_base + 0x24); + for(i = 0; i < 8; i++) + new_regs[i] = ldl_kernel(tss_base + (0x28 + i * 4)); + for(i = 0; i < 6; i++) + new_segs[i] = lduw_kernel(tss_base + (0x48 + i * 4)); + new_ldt = lduw_kernel(tss_base + 0x60); + new_trap = 
ldl_kernel(tss_base + 0x64); + } else { + /* 16 bit */ + new_cr3 = 0; + new_eip = lduw_kernel(tss_base + 0x0e); + new_eflags = lduw_kernel(tss_base + 0x10); + for(i = 0; i < 8; i++) + new_regs[i] = lduw_kernel(tss_base + (0x12 + i * 2)) | 0xffff0000; + for(i = 0; i < 4; i++) + new_segs[i] = lduw_kernel(tss_base + (0x22 + i * 2)); + new_ldt = lduw_kernel(tss_base + 0x2a); + new_segs[R_FS] = 0; + new_segs[R_GS] = 0; + new_trap = 0; + } +#endif + + /* now if an exception occurs, it will occurs in the next task + context */ + + if (source == SWITCH_TSS_CALL) { + stw_kernel(tss_base, env->tr.selector); + new_eflags |= NT_MASK; + } + + /* set busy bit */ + if (source == SWITCH_TSS_JMP || source == SWITCH_TSS_CALL) { + target_ulong ptr; + uint32_t e2; + ptr = env->gdt.base + (tss_selector & ~7); + e2 = ldl_kernel(ptr + 4); + e2 |= DESC_TSS_BUSY_MASK; + stl_kernel(ptr + 4, e2); + } + + /* set the new CPU state */ + /* from this point, any exception which occurs can give problems */ + env->cr[0] |= CR0_TS_MASK; + env->hflags |= HF_TS_MASK; + env->tr.selector = tss_selector; + env->tr.base = tss_base; + env->tr.limit = tss_limit; +#ifndef VBOX + env->tr.flags = e2 & ~DESC_TSS_BUSY_MASK; +#else + env->tr.flags = (e2 | DESC_TSS_BUSY_MASK) & DESC_RAW_FLAG_BITS; + env->tr.fVBoxFlags = CPUMSELREG_FLAGS_VALID; + env->tr.newselector = 0; +#endif + + if ((type & 8) && (env->cr[0] & CR0_PG_MASK)) { + cpu_x86_update_cr3(env, new_cr3); + } + + /* load all registers without an exception, then reload them with + possible exception */ + env->eip = new_eip; + eflags_mask = TF_MASK | AC_MASK | ID_MASK | + IF_MASK | IOPL_MASK | VM_MASK | RF_MASK | NT_MASK; + if (!(type & 8)) + eflags_mask &= 0xffff; + load_eflags(new_eflags, eflags_mask); + /* XXX: what to do in 16 bit case ? 
*/ + EAX = new_regs[0]; + ECX = new_regs[1]; + EDX = new_regs[2]; + EBX = new_regs[3]; + ESP = new_regs[4]; + EBP = new_regs[5]; + ESI = new_regs[6]; + EDI = new_regs[7]; + if (new_eflags & VM_MASK) { + for(i = 0; i < 6; i++) + load_seg_vm(i, new_segs[i]); + /* in vm86, CPL is always 3 */ + cpu_x86_set_cpl(env, 3); + } else { + /* CPL is set the RPL of CS */ + cpu_x86_set_cpl(env, new_segs[R_CS] & 3); + /* first just selectors as the rest may trigger exceptions */ + for(i = 0; i < 6; i++) + cpu_x86_load_seg_cache(env, i, new_segs[i], 0, 0, 0); + } + + env->ldt.selector = new_ldt & ~4; + env->ldt.base = 0; + env->ldt.limit = 0; + env->ldt.flags = 0; +#ifdef VBOX + env->ldt.flags = DESC_INTEL_UNUSABLE; + env->ldt.fVBoxFlags = CPUMSELREG_FLAGS_VALID; + env->ldt.newselector = 0; +#endif + + /* load the LDT */ + if (new_ldt & 4) + raise_exception_err(EXCP0A_TSS, new_ldt & 0xfffc); + + if ((new_ldt & 0xfffc) != 0) { + dt = &env->gdt; + index = new_ldt & ~7; + if ((index + 7) > dt->limit) + raise_exception_err(EXCP0A_TSS, new_ldt & 0xfffc); + ptr = dt->base + index; + e1 = ldl_kernel(ptr); + e2 = ldl_kernel(ptr + 4); + if ((e2 & DESC_S_MASK) || ((e2 >> DESC_TYPE_SHIFT) & 0xf) != 2) + raise_exception_err(EXCP0A_TSS, new_ldt & 0xfffc); + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0A_TSS, new_ldt & 0xfffc); + load_seg_cache_raw_dt(&env->ldt, e1, e2); + } + + /* load the segments */ + if (!(new_eflags & VM_MASK)) { + tss_load_seg(R_CS, new_segs[R_CS]); + tss_load_seg(R_SS, new_segs[R_SS]); + tss_load_seg(R_ES, new_segs[R_ES]); + tss_load_seg(R_DS, new_segs[R_DS]); + tss_load_seg(R_FS, new_segs[R_FS]); + tss_load_seg(R_GS, new_segs[R_GS]); + } + + /* check that EIP is in the CS segment limits */ + if (new_eip > env->segs[R_CS].limit) { + /* XXX: different exception if CALL ? 
*/ + raise_exception_err(EXCP0D_GPF, 0); + } + +#ifndef CONFIG_USER_ONLY + /* reset local breakpoints */ + if (env->dr[7] & 0x55) { + for (i = 0; i < 4; i++) { + if (hw_breakpoint_enabled(env->dr[7], i) == 0x1) + hw_breakpoint_remove(env, i); + } + env->dr[7] &= ~0x55; + } +#endif +} + +/* check if Port I/O is allowed in TSS */ +static inline void check_io(int addr, int size) +{ +#ifndef VBOX + int io_offset, val, mask; +#else + int val, mask; + unsigned int io_offset; +#endif /* VBOX */ + + /* TSS must be a valid 32 bit one */ + if (!(env->tr.flags & DESC_P_MASK) || + ((env->tr.flags >> DESC_TYPE_SHIFT) & 0xf) != 11 || + env->tr.limit < 103) + goto fail; + io_offset = lduw_kernel(env->tr.base + 0x66); + io_offset += (addr >> 3); + /* Note: the check needs two bytes */ + if ((io_offset + 1) > env->tr.limit) + goto fail; + val = lduw_kernel(env->tr.base + io_offset); + val >>= (addr & 7); + mask = (1 << size) - 1; + /* all bits must be zero to allow the I/O */ + if ((val & mask) != 0) { + fail: + raise_exception_err(EXCP0D_GPF, 0); + } +} + +#ifdef VBOX + +/* Keep in sync with gen_check_external_event() */ +void helper_check_external_event() +{ + if ( (env->interrupt_request & ( CPU_INTERRUPT_EXTERNAL_FLUSH_TLB + | CPU_INTERRUPT_EXTERNAL_EXIT + | CPU_INTERRUPT_EXTERNAL_TIMER + | CPU_INTERRUPT_EXTERNAL_DMA)) + || ( (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_HARD) + && (env->eflags & IF_MASK) + && !(env->hflags & HF_INHIBIT_IRQ_MASK) ) ) + { + helper_external_event(); + } + +} + +void helper_sync_seg(uint32_t reg) +{ + if (env->segs[reg].newselector) + sync_seg(env, reg, env->segs[reg].newselector); +} + +#endif /* VBOX */ + +void helper_check_iob(uint32_t t0) +{ + check_io(t0, 1); +} + +void helper_check_iow(uint32_t t0) +{ + check_io(t0, 2); +} + +void helper_check_iol(uint32_t t0) +{ + check_io(t0, 4); +} + +void helper_outb(uint32_t port, uint32_t data) +{ +#ifndef VBOX + cpu_outb(port, data & 0xff); +#else + cpu_outb(env, port, data & 0xff); +#endif +} + +target_ulong helper_inb(uint32_t port) +{ +#ifndef VBOX + return cpu_inb(port); +#else + return cpu_inb(env, port); +#endif +} + +void helper_outw(uint32_t port, uint32_t data) +{ +#ifndef VBOX + cpu_outw(port, data & 0xffff); +#else + cpu_outw(env, port, data & 0xffff); +#endif +} + +target_ulong helper_inw(uint32_t port) +{ +#ifndef VBOX + return cpu_inw(port); +#else + return cpu_inw(env, port); +#endif +} + +void helper_outl(uint32_t port, uint32_t data) +{ +#ifndef VBOX + cpu_outl(port, data); +#else + cpu_outl(env, port, data); +#endif +} + +target_ulong helper_inl(uint32_t port) +{ +#ifndef VBOX + return cpu_inl(port); +#else + return cpu_inl(env, port); +#endif +} + +static inline unsigned int get_sp_mask(unsigned int e2) +{ + if (e2 & DESC_B_MASK) + return 0xffffffff; + else + return 0xffff; +} + +static int exeption_has_error_code(int intno) +{ + switch(intno) { + case 8: + case 10: + case 11: + case 12: + case 13: + case 14: + case 17: + return 1; + } + return 0; +} + +#ifdef TARGET_X86_64 +#define SET_ESP(val, sp_mask)\ +do {\ + if ((sp_mask) == 0xffff)\ + ESP = (ESP & ~0xffff) | ((val) & 0xffff);\ + else if ((sp_mask) == 0xffffffffLL)\ + ESP = (uint32_t)(val);\ + else\ + ESP = (val);\ +} while (0) +#else +#define SET_ESP(val, sp_mask) ESP = (ESP & ~(sp_mask)) | ((val) & (sp_mask)) +#endif + +/* in 64-bit machines, this can overflow. 
So this segment addition macro + * can be used to trim the value to 32-bit whenever needed */ +#define SEG_ADDL(ssp, sp, sp_mask) ((uint32_t)((ssp) + (sp & (sp_mask)))) + +/* XXX: add a is_user flag to have proper security support */ +#define PUSHW(ssp, sp, sp_mask, val)\ +{\ + sp -= 2;\ + stw_kernel((ssp) + (sp & (sp_mask)), (val));\ +} + +#define PUSHL(ssp, sp, sp_mask, val)\ +{\ + sp -= 4;\ + stl_kernel(SEG_ADDL(ssp, sp, sp_mask), (uint32_t)(val));\ +} + +#define POPW(ssp, sp, sp_mask, val)\ +{\ + val = lduw_kernel((ssp) + (sp & (sp_mask)));\ + sp += 2;\ +} + +#define POPL(ssp, sp, sp_mask, val)\ +{\ + val = (uint32_t)ldl_kernel(SEG_ADDL(ssp, sp, sp_mask));\ + sp += 4;\ +} + +/* protected mode interrupt */ +static void do_interrupt_protected(int intno, int is_int, int error_code, + unsigned int next_eip, int is_hw) +{ + SegmentCache *dt; + target_ulong ptr, ssp; + int type, dpl, selector, ss_dpl, cpl; + int has_error_code, new_stack, shift; + uint32_t e1, e2, offset, ss = 0, esp, ss_e1 = 0, ss_e2 = 0; + uint32_t old_eip, sp_mask; + +#ifdef VBOX + if (remR3NotifyTrap(env, intno, error_code, next_eip) != VINF_SUCCESS) + cpu_loop_exit(); +#endif + + has_error_code = 0; + if (!is_int && !is_hw) + has_error_code = exeption_has_error_code(intno); + if (is_int) + old_eip = next_eip; + else + old_eip = env->eip; + + dt = &env->idt; +#ifndef VBOX + if (intno * 8 + 7 > dt->limit) +#else + if ((unsigned)intno * 8 + 7 > dt->limit) +#endif + raise_exception_err(EXCP0D_GPF, intno * 8 + 2); + ptr = dt->base + intno * 8; + e1 = ldl_kernel(ptr); + e2 = ldl_kernel(ptr + 4); + /* check gate type */ + type = (e2 >> DESC_TYPE_SHIFT) & 0x1f; + switch(type) { + case 5: /* task gate */ +#ifdef VBOX + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + /* check privilege if software int */ + if (is_int && dpl < cpl) + raise_exception_err(EXCP0D_GPF, intno * 8 + 2); +#endif + /* must do that check here to return the correct error code */ + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, intno * 8 + 2); + switch_tss(intno * 8, e1, e2, SWITCH_TSS_CALL, old_eip); + if (has_error_code) { + int type; + uint32_t mask; + /* push the error code */ + type = (env->tr.flags >> DESC_TYPE_SHIFT) & 0xf; + shift = type >> 3; + if (env->segs[R_SS].flags & DESC_B_MASK) + mask = 0xffffffff; + else + mask = 0xffff; + esp = (ESP - (2 << shift)) & mask; + ssp = env->segs[R_SS].base + esp; + if (shift) + stl_kernel(ssp, error_code); + else + stw_kernel(ssp, error_code); + SET_ESP(esp, mask); + } + return; + case 6: /* 286 interrupt gate */ + case 7: /* 286 trap gate */ + case 14: /* 386 interrupt gate */ + case 15: /* 386 trap gate */ + break; + default: + raise_exception_err(EXCP0D_GPF, intno * 8 + 2); + break; + } + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + /* check privilege if software int */ + if (is_int && dpl < cpl) + raise_exception_err(EXCP0D_GPF, intno * 8 + 2); + /* check valid bit */ + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, intno * 8 + 2); + selector = e1 >> 16; + offset = (e2 & 0xffff0000) | (e1 & 0x0000ffff); + if ((selector & 0xfffc) == 0) + raise_exception_err(EXCP0D_GPF, 0); + + if (load_segment(&e1, &e2, selector) != 0) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); +#ifdef VBOX /** @todo figure out when this is done one day... 
*/ + if (!(e2 & DESC_A_MASK)) + e2 = set_segment_accessed(selector, e2); +#endif + if (!(e2 & DESC_S_MASK) || !(e2 & (DESC_CS_MASK))) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + if (dpl > cpl) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, selector & 0xfffc); + if (!(e2 & DESC_C_MASK) && dpl < cpl) { + /* to inner privilege */ + get_ss_esp_from_tss(&ss, &esp, dpl); + if ((ss & 0xfffc) == 0) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if ((ss & 3) != dpl) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if (load_segment(&ss_e1, &ss_e2, ss) != 0) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); +#ifdef VBOX /** @todo figure out when this is done one day... */ + if (!(ss_e2 & DESC_A_MASK)) + ss_e2 = set_segment_accessed(ss, ss_e2); +#endif + ss_dpl = (ss_e2 >> DESC_DPL_SHIFT) & 3; + if (ss_dpl != dpl) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if (!(ss_e2 & DESC_S_MASK) || + (ss_e2 & DESC_CS_MASK) || + !(ss_e2 & DESC_W_MASK)) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if (!(ss_e2 & DESC_P_MASK)) +#ifdef VBOX /* See page 3-477 of 253666.pdf */ + raise_exception_err(EXCP0C_STACK, ss & 0xfffc); +#else + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); +#endif + new_stack = 1; + sp_mask = get_sp_mask(ss_e2); + ssp = get_seg_base(ss_e1, ss_e2); +#if defined(VBOX) && defined(DEBUG) + printf("new stack %04X:%08X gate dpl=%d\n", ss, esp, dpl); +#endif + } else if ((e2 & DESC_C_MASK) || dpl == cpl) { + /* to same privilege */ + if (env->eflags & VM_MASK) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + new_stack = 0; + sp_mask = get_sp_mask(env->segs[R_SS].flags); + ssp = env->segs[R_SS].base; + esp = ESP; + dpl = cpl; + } else { + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + new_stack = 0; /* avoid warning */ + sp_mask = 0; /* avoid warning */ + ssp = 0; /* avoid warning */ + esp = 0; /* avoid warning */ + } + + shift = type >> 3; + +#if 0 + /* XXX: check that enough room is available */ + push_size = 6 + (new_stack << 2) + (has_error_code << 1); + if (env->eflags & VM_MASK) + push_size += 8; + push_size <<= shift; +#endif + if (shift == 1) { + if (new_stack) { + if (env->eflags & VM_MASK) { + PUSHL(ssp, esp, sp_mask, env->segs[R_GS].selector); + PUSHL(ssp, esp, sp_mask, env->segs[R_FS].selector); + PUSHL(ssp, esp, sp_mask, env->segs[R_DS].selector); + PUSHL(ssp, esp, sp_mask, env->segs[R_ES].selector); + } + PUSHL(ssp, esp, sp_mask, env->segs[R_SS].selector); + PUSHL(ssp, esp, sp_mask, ESP); + } + PUSHL(ssp, esp, sp_mask, compute_eflags()); + PUSHL(ssp, esp, sp_mask, env->segs[R_CS].selector); + PUSHL(ssp, esp, sp_mask, old_eip); + if (has_error_code) { + PUSHL(ssp, esp, sp_mask, error_code); + } + } else { + if (new_stack) { + if (env->eflags & VM_MASK) { + PUSHW(ssp, esp, sp_mask, env->segs[R_GS].selector); + PUSHW(ssp, esp, sp_mask, env->segs[R_FS].selector); + PUSHW(ssp, esp, sp_mask, env->segs[R_DS].selector); + PUSHW(ssp, esp, sp_mask, env->segs[R_ES].selector); + } + PUSHW(ssp, esp, sp_mask, env->segs[R_SS].selector); + PUSHW(ssp, esp, sp_mask, ESP); + } + PUSHW(ssp, esp, sp_mask, compute_eflags()); + PUSHW(ssp, esp, sp_mask, env->segs[R_CS].selector); + PUSHW(ssp, esp, sp_mask, old_eip); + if (has_error_code) { + PUSHW(ssp, esp, sp_mask, error_code); + } + } + + if (new_stack) { + if (env->eflags & VM_MASK) { + cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0, 0); + cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0, 0); + cpu_x86_load_seg_cache(env, R_FS, 0, 
0, 0, 0); + cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0, 0); + } + ss = (ss & ~3) | dpl; + cpu_x86_load_seg_cache(env, R_SS, ss, + ssp, get_seg_limit(ss_e1, ss_e2), ss_e2); + } + SET_ESP(esp, sp_mask); + + selector = (selector & ~3) | dpl; + cpu_x86_load_seg_cache(env, R_CS, selector, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); + cpu_x86_set_cpl(env, dpl); + env->eip = offset; + + /* interrupt gate clear IF mask */ + if ((type & 1) == 0) { + env->eflags &= ~IF_MASK; + } +#ifndef VBOX + env->eflags &= ~(TF_MASK | VM_MASK | RF_MASK | NT_MASK); +#else + /* + * We must clear VIP/VIF too on interrupt entry, as otherwise FreeBSD + * gets confused by seemingly changed EFLAGS. See #3491 and + * public bug #2341. + */ + env->eflags &= ~(TF_MASK | VM_MASK | RF_MASK | NT_MASK | VIF_MASK | VIP_MASK); +#endif +} + +#ifdef VBOX + +/* check if VME interrupt redirection is enabled in TSS */ +DECLINLINE(bool) is_vme_irq_redirected(int intno) +{ + unsigned int io_offset, intredir_offset; + unsigned char val, mask; + + /* TSS must be a valid 32 bit one */ + if (!(env->tr.flags & DESC_P_MASK) || + ((env->tr.flags >> DESC_TYPE_SHIFT) & 0xf) != 11 || + env->tr.limit < 103) + goto fail; + io_offset = lduw_kernel(env->tr.base + 0x66); + /* Make sure the io bitmap offset is valid; anything less than sizeof(VBOXTSS) means there's none. */ + if (io_offset < 0x68 + 0x20) + io_offset = 0x68 + 0x20; + /* the virtual interrupt redirection bitmap is located below the io bitmap */ + intredir_offset = io_offset - 0x20; + + intredir_offset += (intno >> 3); + if ((intredir_offset) > env->tr.limit) + goto fail; + + val = ldub_kernel(env->tr.base + intredir_offset); + mask = 1 << (unsigned char)(intno & 7); + + /* bit set means no redirection. */ + if ((val & mask) != 0) { + return false; + } + return true; + +fail: + raise_exception_err(EXCP0D_GPF, 0); + return true; +} + +/* V86 mode software interrupt with CR4.VME=1 */ +static void do_soft_interrupt_vme(int intno, int error_code, unsigned int next_eip) +{ + target_ulong ptr, ssp; + int selector; + uint32_t offset, esp; + uint32_t old_cs, old_eflags; + uint32_t iopl; + + iopl = ((env->eflags >> IOPL_SHIFT) & 3); + + if (!is_vme_irq_redirected(intno)) + { + if (iopl == 3) + { + do_interrupt_protected(intno, 1, error_code, next_eip, 0); + return; + } + else + raise_exception_err(EXCP0D_GPF, 0); + } + + /* virtual mode idt is at linear address 0 */ + ptr = 0 + intno * 4; + offset = lduw_kernel(ptr); + selector = lduw_kernel(ptr + 2); + esp = ESP; + ssp = env->segs[R_SS].base; + old_cs = env->segs[R_CS].selector; + + old_eflags = compute_eflags(); + if (iopl < 3) + { + /* copy VIF into IF and set IOPL to 3 */ + if (env->eflags & VIF_MASK) + old_eflags |= IF_MASK; + else + old_eflags &= ~IF_MASK; + + old_eflags |= (3 << IOPL_SHIFT); + } + + /* XXX: use SS segment size ? 
*/ + PUSHW(ssp, esp, 0xffff, old_eflags); + PUSHW(ssp, esp, 0xffff, old_cs); + PUSHW(ssp, esp, 0xffff, next_eip); + + /* update processor state */ + ESP = (ESP & ~0xffff) | (esp & 0xffff); + env->eip = offset; + env->segs[R_CS].selector = selector; + env->segs[R_CS].base = (selector << 4); + env->eflags &= ~(TF_MASK | RF_MASK); + + if (iopl < 3) + env->eflags &= ~VIF_MASK; + else + env->eflags &= ~IF_MASK; +} + +#endif /* VBOX */ + +#ifdef TARGET_X86_64 + +#define PUSHQ(sp, val)\ +{\ + sp -= 8;\ + stq_kernel(sp, (val));\ +} + +#define POPQ(sp, val)\ +{\ + val = ldq_kernel(sp);\ + sp += 8;\ +} + +static inline target_ulong get_rsp_from_tss(int level) +{ + int index; + +#if 0 + printf("TR: base=" TARGET_FMT_lx " limit=%x\n", + env->tr.base, env->tr.limit); +#endif + + if (!(env->tr.flags & DESC_P_MASK)) + cpu_abort(env, "invalid tss"); + index = 8 * level + 4; + if ((index + 7) > env->tr.limit) + raise_exception_err(EXCP0A_TSS, env->tr.selector & 0xfffc); + return ldq_kernel(env->tr.base + index); +} + +/* 64 bit interrupt */ +static void do_interrupt64(int intno, int is_int, int error_code, + target_ulong next_eip, int is_hw) +{ + SegmentCache *dt; + target_ulong ptr; + int type, dpl, selector, cpl, ist; + int has_error_code, new_stack; + uint32_t e1, e2, e3, ss; + target_ulong old_eip, esp, offset; + +#ifdef VBOX + if (remR3NotifyTrap(env, intno, error_code, next_eip) != VINF_SUCCESS) + cpu_loop_exit(); +#endif + + has_error_code = 0; + if (!is_int && !is_hw) + has_error_code = exeption_has_error_code(intno); + if (is_int) + old_eip = next_eip; + else + old_eip = env->eip; + + dt = &env->idt; + if (intno * 16 + 15 > dt->limit) + raise_exception_err(EXCP0D_GPF, intno * 16 + 2); + ptr = dt->base + intno * 16; + e1 = ldl_kernel(ptr); + e2 = ldl_kernel(ptr + 4); + e3 = ldl_kernel(ptr + 8); + /* check gate type */ + type = (e2 >> DESC_TYPE_SHIFT) & 0x1f; + switch(type) { + case 14: /* 386 interrupt gate */ + case 15: /* 386 trap gate */ + break; + default: + raise_exception_err(EXCP0D_GPF, intno * 16 + 2); + break; + } + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + /* check privilege if software int */ + if (is_int && dpl < cpl) + raise_exception_err(EXCP0D_GPF, intno * 16 + 2); + /* check valid bit */ + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, intno * 16 + 2); + selector = e1 >> 16; + offset = ((target_ulong)e3 << 32) | (e2 & 0xffff0000) | (e1 & 0x0000ffff); + ist = e2 & 7; + if ((selector & 0xfffc) == 0) + raise_exception_err(EXCP0D_GPF, 0); + + if (load_segment(&e1, &e2, selector) != 0) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (!(e2 & DESC_S_MASK) || !(e2 & (DESC_CS_MASK))) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + if (dpl > cpl) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, selector & 0xfffc); + if (!(e2 & DESC_L_MASK) || (e2 & DESC_B_MASK)) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if ((!(e2 & DESC_C_MASK) && dpl < cpl) || ist != 0) { + /* to inner privilege */ + if (ist != 0) + esp = get_rsp_from_tss(ist + 3); + else + esp = get_rsp_from_tss(dpl); + esp &= ~0xfLL; /* align stack */ + ss = 0; + new_stack = 1; + } else if ((e2 & DESC_C_MASK) || dpl == cpl) { + /* to same privilege */ + if (env->eflags & VM_MASK) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + new_stack = 0; + if (ist != 0) + esp = get_rsp_from_tss(ist + 3); + else + esp = ESP; + esp &= ~0xfLL; /* align stack */ + dpl = 
cpl; + } else { + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + new_stack = 0; /* avoid warning */ + esp = 0; /* avoid warning */ + } + + PUSHQ(esp, env->segs[R_SS].selector); + PUSHQ(esp, ESP); + PUSHQ(esp, compute_eflags()); + PUSHQ(esp, env->segs[R_CS].selector); + PUSHQ(esp, old_eip); + if (has_error_code) { + PUSHQ(esp, error_code); + } + + if (new_stack) { + ss = 0 | dpl; +#ifndef VBOX + cpu_x86_load_seg_cache(env, R_SS, ss, 0, 0, 0); +#else + cpu_x86_load_seg_cache(env, R_SS, ss, 0, 0, dpl << DESC_DPL_SHIFT); +#endif + } + ESP = esp; + + selector = (selector & ~3) | dpl; + cpu_x86_load_seg_cache(env, R_CS, selector, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); + cpu_x86_set_cpl(env, dpl); + env->eip = offset; + + /* interrupt gate clear IF mask */ + if ((type & 1) == 0) { + env->eflags &= ~IF_MASK; + } +#ifndef VBOX + env->eflags &= ~(TF_MASK | VM_MASK | RF_MASK | NT_MASK); +#else /* VBOX */ + /* + * We must clear VIP/VIF too on interrupt entry, as otherwise FreeBSD + * gets confused by seemingly changed EFLAGS. See #3491 and + * public bug #2341. + */ + env->eflags &= ~(TF_MASK | VM_MASK | RF_MASK | NT_MASK | VIF_MASK | VIP_MASK); +#endif /* VBOX */ +} +#endif + +#ifdef TARGET_X86_64 +#if defined(CONFIG_USER_ONLY) +void helper_syscall(int next_eip_addend) +{ + env->exception_index = EXCP_SYSCALL; + env->exception_next_eip = env->eip + next_eip_addend; + cpu_loop_exit(); +} +#else +void helper_syscall(int next_eip_addend) +{ + int selector; + + if (!(env->efer & MSR_EFER_SCE)) { + raise_exception_err(EXCP06_ILLOP, 0); + } + selector = (env->star >> 32) & 0xffff; + if (env->hflags & HF_LMA_MASK) { + int code64; + + ECX = env->eip + next_eip_addend; + env->regs[11] = compute_eflags(); + + code64 = env->hflags & HF_CS64_MASK; + + cpu_x86_set_cpl(env, 0); + cpu_x86_load_seg_cache(env, R_CS, selector & 0xfffc, + 0, 0xffffffff, + DESC_G_MASK | DESC_P_MASK | + DESC_S_MASK | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK | DESC_L_MASK); + cpu_x86_load_seg_cache(env, R_SS, (selector + 8) & 0xfffc, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | + DESC_W_MASK | DESC_A_MASK); + env->eflags &= ~env->fmask; + load_eflags(env->eflags, 0); + if (code64) + env->eip = env->lstar; + else + env->eip = env->cstar; + } else { + ECX = (uint32_t)(env->eip + next_eip_addend); + + cpu_x86_set_cpl(env, 0); + cpu_x86_load_seg_cache(env, R_CS, selector & 0xfffc, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK); + cpu_x86_load_seg_cache(env, R_SS, (selector + 8) & 0xfffc, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | + DESC_W_MASK | DESC_A_MASK); + env->eflags &= ~(IF_MASK | RF_MASK | VM_MASK); + env->eip = (uint32_t)env->star; + } +} +#endif +#endif + +#ifdef TARGET_X86_64 +void helper_sysret(int dflag) +{ + int cpl, selector; + + if (!(env->efer & MSR_EFER_SCE)) { + raise_exception_err(EXCP06_ILLOP, 0); + } + cpl = env->hflags & HF_CPL_MASK; + if (!(env->cr[0] & CR0_PE_MASK) || cpl != 0) { + raise_exception_err(EXCP0D_GPF, 0); + } + selector = (env->star >> 48) & 0xffff; + if (env->hflags & HF_LMA_MASK) { + if (dflag == 2) { + cpu_x86_load_seg_cache(env, R_CS, (selector + 16) | 3, + 0, 0xffffffff, + DESC_G_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK | + DESC_L_MASK); + env->eip = ECX; + } else { + cpu_x86_load_seg_cache(env, R_CS, selector | 3, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + 
DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK); + env->eip = (uint32_t)ECX; + } + cpu_x86_load_seg_cache(env, R_SS, selector + 8, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_W_MASK | DESC_A_MASK); + load_eflags((uint32_t)(env->regs[11]), TF_MASK | AC_MASK | ID_MASK | + IF_MASK | IOPL_MASK | VM_MASK | RF_MASK | NT_MASK); + cpu_x86_set_cpl(env, 3); + } else { + cpu_x86_load_seg_cache(env, R_CS, selector | 3, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK); + env->eip = (uint32_t)ECX; + cpu_x86_load_seg_cache(env, R_SS, selector + 8, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_W_MASK | DESC_A_MASK); + env->eflags |= IF_MASK; + cpu_x86_set_cpl(env, 3); + } +} +#endif + +#ifdef VBOX + +/** + * Checks and processes external VMM events. + * Called by op_check_external_event() when any of the flags is set and can be serviced. + */ +void helper_external_event(void) +{ +# if defined(RT_OS_DARWIN) && defined(VBOX_STRICT) + uintptr_t uSP; +# ifdef RT_ARCH_AMD64 + __asm__ __volatile__("movq %%rsp, %0" : "=r" (uSP)); +# else + __asm__ __volatile__("movl %%esp, %0" : "=r" (uSP)); +# endif + AssertMsg(!(uSP & 15), ("xSP=%#p\n", uSP)); +# endif + /* Keep in sync with flags checked by gen_check_external_event() */ + if (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_HARD) + { + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, + ~CPU_INTERRUPT_EXTERNAL_HARD); + cpu_interrupt(env, CPU_INTERRUPT_HARD); + } + if (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_EXIT) + { + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, + ~CPU_INTERRUPT_EXTERNAL_EXIT); + cpu_exit(env); + } + if (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_DMA) + { + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, + ~CPU_INTERRUPT_EXTERNAL_DMA); + remR3DmaRun(env); + } + if (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_TIMER) + { + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, + ~CPU_INTERRUPT_EXTERNAL_TIMER); + remR3TimersRun(env); + } + if (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_FLUSH_TLB) + { + ASMAtomicAndS32((int32_t volatile *)&env->interrupt_request, + ~CPU_INTERRUPT_EXTERNAL_HARD); + cpu_interrupt(env, CPU_INTERRUPT_HARD); + } +} + +/* helper for recording call instruction addresses for later scanning */ +void helper_record_call() +{ + if ( !(env->state & CPU_RAW_RING0) + && (env->cr[0] & CR0_PG_MASK) + && !(env->eflags & X86_EFL_IF)) + remR3RecordCall(env); +} + +#endif /* VBOX */ + +/* real mode interrupt */ +static void do_interrupt_real(int intno, int is_int, int error_code, + unsigned int next_eip) +{ + SegmentCache *dt; + target_ulong ptr, ssp; + int selector; + uint32_t offset, esp; + uint32_t old_cs, old_eip; + + /* real mode (simpler !) */ + dt = &env->idt; +#ifndef VBOX + if (intno * 4 + 3 > dt->limit) +#else + if ((unsigned)intno * 4 + 3 > dt->limit) +#endif + raise_exception_err(EXCP0D_GPF, intno * 8 + 2); + ptr = dt->base + intno * 4; + offset = lduw_kernel(ptr); + selector = lduw_kernel(ptr + 2); + esp = ESP; + ssp = env->segs[R_SS].base; + if (is_int) + old_eip = next_eip; + else + old_eip = env->eip; + old_cs = env->segs[R_CS].selector; + /* XXX: use SS segment size ? 
*/ + PUSHW(ssp, esp, 0xffff, compute_eflags()); + PUSHW(ssp, esp, 0xffff, old_cs); + PUSHW(ssp, esp, 0xffff, old_eip); + + /* update processor state */ + ESP = (ESP & ~0xffff) | (esp & 0xffff); + env->eip = offset; + env->segs[R_CS].selector = selector; + env->segs[R_CS].base = (selector << 4); + env->eflags &= ~(IF_MASK | TF_MASK | AC_MASK | RF_MASK); +} + +/* fake user mode interrupt */ +void do_interrupt_user(int intno, int is_int, int error_code, + target_ulong next_eip) +{ + SegmentCache *dt; + target_ulong ptr; + int dpl, cpl, shift; + uint32_t e2; + + dt = &env->idt; + if (env->hflags & HF_LMA_MASK) { + shift = 4; + } else { + shift = 3; + } + ptr = dt->base + (intno << shift); + e2 = ldl_kernel(ptr + 4); + + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + /* check privilege if software int */ + if (is_int && dpl < cpl) + raise_exception_err(EXCP0D_GPF, (intno << shift) + 2); + + /* Since we emulate only user space, we cannot do more than + exiting the emulation with the suitable exception and error + code */ + if (is_int) + EIP = next_eip; +} + +#if !defined(CONFIG_USER_ONLY) +static void handle_even_inj(int intno, int is_int, int error_code, + int is_hw, int rm) +{ + uint32_t event_inj = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj)); + if (!(event_inj & SVM_EVTINJ_VALID)) { + int type; + if (is_int) + type = SVM_EVTINJ_TYPE_SOFT; + else + type = SVM_EVTINJ_TYPE_EXEPT; + event_inj = intno | type | SVM_EVTINJ_VALID; + if (!rm && exeption_has_error_code(intno)) { + event_inj |= SVM_EVTINJ_VALID_ERR; + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj_err), error_code); + } + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj), event_inj); + } +} +#endif + +/* + * Begin execution of an interruption. is_int is TRUE if coming from + * the int instruction. next_eip is the EIP value AFTER the interrupt + * instruction. It is only relevant if is_int is TRUE. + */ +void do_interrupt(int intno, int is_int, int error_code, + target_ulong next_eip, int is_hw) +{ + if (qemu_loglevel_mask(CPU_LOG_INT)) { + if ((env->cr[0] & CR0_PE_MASK)) { + static int count; + qemu_log("%6d: v=%02x e=%04x i=%d cpl=%d IP=%04x:" TARGET_FMT_lx " pc=" TARGET_FMT_lx " SP=%04x:" TARGET_FMT_lx, + count, intno, error_code, is_int, + env->hflags & HF_CPL_MASK, + env->segs[R_CS].selector, EIP, + (int)env->segs[R_CS].base + EIP, + env->segs[R_SS].selector, ESP); + if (intno == 0x0e) { + qemu_log(" CR2=" TARGET_FMT_lx, env->cr[2]); + } else { + qemu_log(" EAX=" TARGET_FMT_lx, EAX); + } + qemu_log("\n"); + log_cpu_state(env, X86_DUMP_CCOP); +#if 0 + { + int i; + uint8_t *ptr; + qemu_log(" code="); + ptr = env->segs[R_CS].base + env->eip; + for(i = 0; i < 16; i++) { + qemu_log(" %02x", ldub(ptr + i)); + } + qemu_log("\n"); + } +#endif + count++; + } + } +#ifdef VBOX + if (RT_UNLIKELY(env->state & CPU_EMULATE_SINGLE_STEP)) { + if (is_int) { + RTLogPrintf("do_interrupt: %#04x err=%#x pc=%#RGv%s\n", + intno, error_code, (RTGCPTR)env->eip, is_hw ? " hw" : ""); + } else { + RTLogPrintf("do_interrupt: %#04x err=%#x pc=%#RGv next=%#RGv%s\n", + intno, error_code, (RTGCPTR)env->eip, (RTGCPTR)next_eip, is_hw ? 
" hw" : ""); + } + } +#endif + if (env->cr[0] & CR0_PE_MASK) { +#if !defined(CONFIG_USER_ONLY) + if (env->hflags & HF_SVMI_MASK) + handle_even_inj(intno, is_int, error_code, is_hw, 0); +#endif +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) { + do_interrupt64(intno, is_int, error_code, next_eip, is_hw); + } else +#endif + { +#ifdef VBOX + /* int xx *, v86 code and VME enabled? */ + if ( (env->eflags & VM_MASK) + && (env->cr[4] & CR4_VME_MASK) + && is_int + && !is_hw + && env->eip + 1 != next_eip /* single byte int 3 goes straight to the protected mode handler */ + ) + do_soft_interrupt_vme(intno, error_code, next_eip); + else +#endif /* VBOX */ + do_interrupt_protected(intno, is_int, error_code, next_eip, is_hw); + } + } else { +#if !defined(CONFIG_USER_ONLY) + if (env->hflags & HF_SVMI_MASK) + handle_even_inj(intno, is_int, error_code, is_hw, 1); +#endif + do_interrupt_real(intno, is_int, error_code, next_eip); + } + +#if !defined(CONFIG_USER_ONLY) + if (env->hflags & HF_SVMI_MASK) { + uint32_t event_inj = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj)); + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj), event_inj & ~SVM_EVTINJ_VALID); + } +#endif +} + +/* This should come from sysemu.h - if we could include it here... */ +void qemu_system_reset_request(void); + +/* + * Check nested exceptions and change to double or triple fault if + * needed. It should only be called, if this is not an interrupt. + * Returns the new exception number. + */ +static int check_exception(int intno, int *error_code) +{ + int first_contributory = env->old_exception == 0 || + (env->old_exception >= 10 && + env->old_exception <= 13); + int second_contributory = intno == 0 || + (intno >= 10 && intno <= 13); + + qemu_log_mask(CPU_LOG_INT, "check_exception old: 0x%x new 0x%x\n", + env->old_exception, intno); + +#if !defined(CONFIG_USER_ONLY) + if (env->old_exception == EXCP08_DBLE) { + if (env->hflags & HF_SVMI_MASK) + helper_vmexit(SVM_EXIT_SHUTDOWN, 0); /* does not return */ + + qemu_log_mask(CPU_LOG_RESET, "Triple fault\n"); + +# ifndef VBOX + qemu_system_reset_request(); + return EXCP_HLT; +# else + remR3RaiseRC(env->pVM, VINF_EM_TRIPLE_FAULT); + return EXCP_RC; +# endif + } +#endif + + if ((first_contributory && second_contributory) + || (env->old_exception == EXCP0E_PAGE && + (second_contributory || (intno == EXCP0E_PAGE)))) { + intno = EXCP08_DBLE; + *error_code = 0; + } + + if (second_contributory || (intno == EXCP0E_PAGE) || + (intno == EXCP08_DBLE)) + env->old_exception = intno; + + return intno; +} + +/* + * Signal an interruption. It is executed in the main CPU loop. + * is_int is TRUE if coming from the int instruction. next_eip is the + * EIP value AFTER the interrupt instruction. It is only relevant if + * is_int is TRUE. 
+ */ +static void QEMU_NORETURN raise_interrupt(int intno, int is_int, int error_code, + int next_eip_addend) +{ +#if defined(VBOX) && defined(DEBUG) + Log2(("raise_interrupt: %x %x %x %RGv\n", intno, is_int, error_code, (RTGCPTR)env->eip + next_eip_addend)); +#endif + if (!is_int) { + helper_svm_check_intercept_param(SVM_EXIT_EXCP_BASE + intno, error_code); + intno = check_exception(intno, &error_code); + } else { + helper_svm_check_intercept_param(SVM_EXIT_SWINT, 0); + } + + env->exception_index = intno; + env->error_code = error_code; + env->exception_is_int = is_int; + env->exception_next_eip = env->eip + next_eip_addend; + cpu_loop_exit(); +} + +/* shortcuts to generate exceptions */ + +void raise_exception_err(int exception_index, int error_code) +{ + raise_interrupt(exception_index, 0, error_code, 0); +} + +void raise_exception(int exception_index) +{ + raise_interrupt(exception_index, 0, 0, 0); +} + +void raise_exception_env(int exception_index, CPUState *nenv) +{ + env = nenv; + raise_exception(exception_index); +} +/* SMM support */ + +#if defined(CONFIG_USER_ONLY) + +void do_smm_enter(void) +{ +} + +void helper_rsm(void) +{ +} + +#else + +#ifdef TARGET_X86_64 +#define SMM_REVISION_ID 0x00020064 +#else +#define SMM_REVISION_ID 0x00020000 +#endif + +void do_smm_enter(void) +{ + target_ulong sm_state; + SegmentCache *dt; + int i, offset; + + qemu_log_mask(CPU_LOG_INT, "SMM: enter\n"); + log_cpu_state_mask(CPU_LOG_INT, env, X86_DUMP_CCOP); + + env->hflags |= HF_SMM_MASK; + cpu_smm_update(env); + + sm_state = env->smbase + 0x8000; + +#ifdef TARGET_X86_64 + for(i = 0; i < 6; i++) { + dt = &env->segs[i]; + offset = 0x7e00 + i * 16; + stw_phys(sm_state + offset, dt->selector); + stw_phys(sm_state + offset + 2, (dt->flags >> 8) & 0xf0ff); + stl_phys(sm_state + offset + 4, dt->limit); + stq_phys(sm_state + offset + 8, dt->base); + } + + stq_phys(sm_state + 0x7e68, env->gdt.base); + stl_phys(sm_state + 0x7e64, env->gdt.limit); + + stw_phys(sm_state + 0x7e70, env->ldt.selector); + stq_phys(sm_state + 0x7e78, env->ldt.base); + stl_phys(sm_state + 0x7e74, env->ldt.limit); + stw_phys(sm_state + 0x7e72, (env->ldt.flags >> 8) & 0xf0ff); + + stq_phys(sm_state + 0x7e88, env->idt.base); + stl_phys(sm_state + 0x7e84, env->idt.limit); + + stw_phys(sm_state + 0x7e90, env->tr.selector); + stq_phys(sm_state + 0x7e98, env->tr.base); + stl_phys(sm_state + 0x7e94, env->tr.limit); + stw_phys(sm_state + 0x7e92, (env->tr.flags >> 8) & 0xf0ff); + + stq_phys(sm_state + 0x7ed0, env->efer); + + stq_phys(sm_state + 0x7ff8, EAX); + stq_phys(sm_state + 0x7ff0, ECX); + stq_phys(sm_state + 0x7fe8, EDX); + stq_phys(sm_state + 0x7fe0, EBX); + stq_phys(sm_state + 0x7fd8, ESP); + stq_phys(sm_state + 0x7fd0, EBP); + stq_phys(sm_state + 0x7fc8, ESI); + stq_phys(sm_state + 0x7fc0, EDI); + for(i = 8; i < 16; i++) + stq_phys(sm_state + 0x7ff8 - i * 8, env->regs[i]); + stq_phys(sm_state + 0x7f78, env->eip); + stl_phys(sm_state + 0x7f70, compute_eflags()); + stl_phys(sm_state + 0x7f68, env->dr[6]); + stl_phys(sm_state + 0x7f60, env->dr[7]); + + stl_phys(sm_state + 0x7f48, env->cr[4]); + stl_phys(sm_state + 0x7f50, env->cr[3]); + stl_phys(sm_state + 0x7f58, env->cr[0]); + + stl_phys(sm_state + 0x7efc, SMM_REVISION_ID); + stl_phys(sm_state + 0x7f00, env->smbase); +#else + stl_phys(sm_state + 0x7ffc, env->cr[0]); + stl_phys(sm_state + 0x7ff8, env->cr[3]); + stl_phys(sm_state + 0x7ff4, compute_eflags()); + stl_phys(sm_state + 0x7ff0, env->eip); + stl_phys(sm_state + 0x7fec, EDI); + stl_phys(sm_state + 0x7fe8, ESI); + 
stl_phys(sm_state + 0x7fe4, EBP); + stl_phys(sm_state + 0x7fe0, ESP); + stl_phys(sm_state + 0x7fdc, EBX); + stl_phys(sm_state + 0x7fd8, EDX); + stl_phys(sm_state + 0x7fd4, ECX); + stl_phys(sm_state + 0x7fd0, EAX); + stl_phys(sm_state + 0x7fcc, env->dr[6]); + stl_phys(sm_state + 0x7fc8, env->dr[7]); + + stl_phys(sm_state + 0x7fc4, env->tr.selector); + stl_phys(sm_state + 0x7f64, env->tr.base); + stl_phys(sm_state + 0x7f60, env->tr.limit); + stl_phys(sm_state + 0x7f5c, (env->tr.flags >> 8) & 0xf0ff); + + stl_phys(sm_state + 0x7fc0, env->ldt.selector); + stl_phys(sm_state + 0x7f80, env->ldt.base); + stl_phys(sm_state + 0x7f7c, env->ldt.limit); + stl_phys(sm_state + 0x7f78, (env->ldt.flags >> 8) & 0xf0ff); + + stl_phys(sm_state + 0x7f74, env->gdt.base); + stl_phys(sm_state + 0x7f70, env->gdt.limit); + + stl_phys(sm_state + 0x7f58, env->idt.base); + stl_phys(sm_state + 0x7f54, env->idt.limit); + + for(i = 0; i < 6; i++) { + dt = &env->segs[i]; + if (i < 3) + offset = 0x7f84 + i * 12; + else + offset = 0x7f2c + (i - 3) * 12; + stl_phys(sm_state + 0x7fa8 + i * 4, dt->selector); + stl_phys(sm_state + offset + 8, dt->base); + stl_phys(sm_state + offset + 4, dt->limit); + stl_phys(sm_state + offset, (dt->flags >> 8) & 0xf0ff); + } + stl_phys(sm_state + 0x7f14, env->cr[4]); + + stl_phys(sm_state + 0x7efc, SMM_REVISION_ID); + stl_phys(sm_state + 0x7ef8, env->smbase); +#endif + /* init SMM cpu state */ + +#ifdef TARGET_X86_64 + cpu_load_efer(env, 0); +#endif + load_eflags(0, ~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK)); + env->eip = 0x00008000; + cpu_x86_load_seg_cache(env, R_CS, (env->smbase >> 4) & 0xffff, env->smbase, + 0xffffffff, 0); + cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffffffff, 0); + cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffffffff, 0); + cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffffffff, 0); + cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffffffff, 0); + cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffffffff, 0); + + cpu_x86_update_cr0(env, + env->cr[0] & ~(CR0_PE_MASK | CR0_EM_MASK | CR0_TS_MASK | CR0_PG_MASK)); + cpu_x86_update_cr4(env, 0); + env->dr[7] = 0x00000400; + CC_OP = CC_OP_EFLAGS; +} + +void helper_rsm(void) +{ +#ifdef VBOX + cpu_abort(env, "helper_rsm"); +#else /* !VBOX */ + target_ulong sm_state; + int i, offset; + uint32_t val; + + sm_state = env->smbase + 0x8000; +#ifdef TARGET_X86_64 + cpu_load_efer(env, ldq_phys(sm_state + 0x7ed0)); + + for(i = 0; i < 6; i++) { + offset = 0x7e00 + i * 16; + cpu_x86_load_seg_cache(env, i, + lduw_phys(sm_state + offset), + ldq_phys(sm_state + offset + 8), + ldl_phys(sm_state + offset + 4), + (lduw_phys(sm_state + offset + 2) & 0xf0ff) << 8); + } + + env->gdt.base = ldq_phys(sm_state + 0x7e68); + env->gdt.limit = ldl_phys(sm_state + 0x7e64); + + env->ldt.selector = lduw_phys(sm_state + 0x7e70); + env->ldt.base = ldq_phys(sm_state + 0x7e78); + env->ldt.limit = ldl_phys(sm_state + 0x7e74); + env->ldt.flags = (lduw_phys(sm_state + 0x7e72) & 0xf0ff) << 8; +#ifdef VBOX + env->ldt.fVBoxFlags = CPUMSELREG_FLAGS_VALID; + env->ldt.newselector = 0; +#endif + + env->idt.base = ldq_phys(sm_state + 0x7e88); + env->idt.limit = ldl_phys(sm_state + 0x7e84); + + env->tr.selector = lduw_phys(sm_state + 0x7e90); + env->tr.base = ldq_phys(sm_state + 0x7e98); + env->tr.limit = ldl_phys(sm_state + 0x7e94); + env->tr.flags = (lduw_phys(sm_state + 0x7e92) & 0xf0ff) << 8; +#ifdef VBOX + env->tr.fVBoxFlags = CPUMSELREG_FLAGS_VALID; + env->tr.newselector = 0; +#endif + + EAX = ldq_phys(sm_state + 0x7ff8); + ECX = ldq_phys(sm_state + 0x7ff0); + EDX = 
ldq_phys(sm_state + 0x7fe8); + EBX = ldq_phys(sm_state + 0x7fe0); + ESP = ldq_phys(sm_state + 0x7fd8); + EBP = ldq_phys(sm_state + 0x7fd0); + ESI = ldq_phys(sm_state + 0x7fc8); + EDI = ldq_phys(sm_state + 0x7fc0); + for(i = 8; i < 16; i++) + env->regs[i] = ldq_phys(sm_state + 0x7ff8 - i * 8); + env->eip = ldq_phys(sm_state + 0x7f78); + load_eflags(ldl_phys(sm_state + 0x7f70), + ~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK)); + env->dr[6] = ldl_phys(sm_state + 0x7f68); + env->dr[7] = ldl_phys(sm_state + 0x7f60); + + cpu_x86_update_cr4(env, ldl_phys(sm_state + 0x7f48)); + cpu_x86_update_cr3(env, ldl_phys(sm_state + 0x7f50)); + cpu_x86_update_cr0(env, ldl_phys(sm_state + 0x7f58)); + + val = ldl_phys(sm_state + 0x7efc); /* revision ID */ + if (val & 0x20000) { + env->smbase = ldl_phys(sm_state + 0x7f00) & ~0x7fff; + } +#else + cpu_x86_update_cr0(env, ldl_phys(sm_state + 0x7ffc)); + cpu_x86_update_cr3(env, ldl_phys(sm_state + 0x7ff8)); + load_eflags(ldl_phys(sm_state + 0x7ff4), + ~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK)); + env->eip = ldl_phys(sm_state + 0x7ff0); + EDI = ldl_phys(sm_state + 0x7fec); + ESI = ldl_phys(sm_state + 0x7fe8); + EBP = ldl_phys(sm_state + 0x7fe4); + ESP = ldl_phys(sm_state + 0x7fe0); + EBX = ldl_phys(sm_state + 0x7fdc); + EDX = ldl_phys(sm_state + 0x7fd8); + ECX = ldl_phys(sm_state + 0x7fd4); + EAX = ldl_phys(sm_state + 0x7fd0); + env->dr[6] = ldl_phys(sm_state + 0x7fcc); + env->dr[7] = ldl_phys(sm_state + 0x7fc8); + + env->tr.selector = ldl_phys(sm_state + 0x7fc4) & 0xffff; + env->tr.base = ldl_phys(sm_state + 0x7f64); + env->tr.limit = ldl_phys(sm_state + 0x7f60); + env->tr.flags = (ldl_phys(sm_state + 0x7f5c) & 0xf0ff) << 8; +#ifdef VBOX + env->tr.fVBoxFlags = CPUMSELREG_FLAGS_VALID; + env->tr.newselector = 0; +#endif + + env->ldt.selector = ldl_phys(sm_state + 0x7fc0) & 0xffff; + env->ldt.base = ldl_phys(sm_state + 0x7f80); + env->ldt.limit = ldl_phys(sm_state + 0x7f7c); + env->ldt.flags = (ldl_phys(sm_state + 0x7f78) & 0xf0ff) << 8; +#ifdef VBOX + env->ldt.fVBoxFlags = CPUMSELREG_FLAGS_VALID; + env->ldt.newselector = 0; +#endif + + env->gdt.base = ldl_phys(sm_state + 0x7f74); + env->gdt.limit = ldl_phys(sm_state + 0x7f70); + + env->idt.base = ldl_phys(sm_state + 0x7f58); + env->idt.limit = ldl_phys(sm_state + 0x7f54); + + for(i = 0; i < 6; i++) { + if (i < 3) + offset = 0x7f84 + i * 12; + else + offset = 0x7f2c + (i - 3) * 12; + cpu_x86_load_seg_cache(env, i, + ldl_phys(sm_state + 0x7fa8 + i * 4) & 0xffff, + ldl_phys(sm_state + offset + 8), + ldl_phys(sm_state + offset + 4), + (ldl_phys(sm_state + offset) & 0xf0ff) << 8); + } + cpu_x86_update_cr4(env, ldl_phys(sm_state + 0x7f14)); + + val = ldl_phys(sm_state + 0x7efc); /* revision ID */ + if (val & 0x20000) { + env->smbase = ldl_phys(sm_state + 0x7ef8) & ~0x7fff; + } +#endif + CC_OP = CC_OP_EFLAGS; + env->hflags &= ~HF_SMM_MASK; + cpu_smm_update(env); + + qemu_log_mask(CPU_LOG_INT, "SMM: after RSM\n"); + log_cpu_state_mask(CPU_LOG_INT, env, X86_DUMP_CCOP); +#endif /* !VBOX */ +} + +#endif /* !CONFIG_USER_ONLY */ + + +/* division, flags are undefined */ + +void helper_divb_AL(target_ulong t0) +{ + unsigned int num, den, q, r; + + num = (EAX & 0xffff); + den = (t0 & 0xff); + if (den == 0) { + raise_exception(EXCP00_DIVZ); + } + q = (num / den); + if (q > 0xff) + raise_exception(EXCP00_DIVZ); + q &= 0xff; + r = (num % den) & 0xff; + EAX = (EAX & ~0xffff) | (r << 8) | q; +} + +void helper_idivb_AL(target_ulong t0) +{ + int num, den, q, r; + + num = (int16_t)EAX; + den = (int8_t)t0; + if (den == 0) { 
+ raise_exception(EXCP00_DIVZ); + } + q = (num / den); + if (q != (int8_t)q) + raise_exception(EXCP00_DIVZ); + q &= 0xff; + r = (num % den) & 0xff; + EAX = (EAX & ~0xffff) | (r << 8) | q; +} + +void helper_divw_AX(target_ulong t0) +{ + unsigned int num, den, q, r; + + num = (EAX & 0xffff) | ((EDX & 0xffff) << 16); + den = (t0 & 0xffff); + if (den == 0) { + raise_exception(EXCP00_DIVZ); + } + q = (num / den); + if (q > 0xffff) + raise_exception(EXCP00_DIVZ); + q &= 0xffff; + r = (num % den) & 0xffff; + EAX = (EAX & ~0xffff) | q; + EDX = (EDX & ~0xffff) | r; +} + +void helper_idivw_AX(target_ulong t0) +{ + int num, den, q, r; + + num = (EAX & 0xffff) | ((EDX & 0xffff) << 16); + den = (int16_t)t0; + if (den == 0) { + raise_exception(EXCP00_DIVZ); + } + q = (num / den); + if (q != (int16_t)q) + raise_exception(EXCP00_DIVZ); + q &= 0xffff; + r = (num % den) & 0xffff; + EAX = (EAX & ~0xffff) | q; + EDX = (EDX & ~0xffff) | r; +} + +void helper_divl_EAX(target_ulong t0) +{ + unsigned int den, r; + uint64_t num, q; + + num = ((uint32_t)EAX) | ((uint64_t)((uint32_t)EDX) << 32); + den = t0; + if (den == 0) { + raise_exception(EXCP00_DIVZ); + } + q = (num / den); + r = (num % den); + if (q > 0xffffffff) + raise_exception(EXCP00_DIVZ); + EAX = (uint32_t)q; + EDX = (uint32_t)r; +} + +void helper_idivl_EAX(target_ulong t0) +{ + int den, r; + int64_t num, q; + + num = ((uint32_t)EAX) | ((uint64_t)((uint32_t)EDX) << 32); + den = t0; + if (den == 0) { + raise_exception(EXCP00_DIVZ); + } + q = (num / den); + r = (num % den); + if (q != (int32_t)q) + raise_exception(EXCP00_DIVZ); + EAX = (uint32_t)q; + EDX = (uint32_t)r; +} + +/* bcd */ + +/* XXX: exception */ +void helper_aam(int base) +{ + int al, ah; + al = EAX & 0xff; + ah = al / base; + al = al % base; + EAX = (EAX & ~0xffff) | al | (ah << 8); + CC_DST = al; +} + +void helper_aad(int base) +{ + int al, ah; + al = EAX & 0xff; + ah = (EAX >> 8) & 0xff; + al = ((ah * base) + al) & 0xff; + EAX = (EAX & ~0xffff) | al; + CC_DST = al; +} + +void helper_aaa(void) +{ + int icarry; + int al, ah, af; + int eflags; + + eflags = helper_cc_compute_all(CC_OP); + af = eflags & CC_A; + al = EAX & 0xff; + ah = (EAX >> 8) & 0xff; + + icarry = (al > 0xf9); + if (((al & 0x0f) > 9 ) || af) { + al = (al + 6) & 0x0f; + ah = (ah + 1 + icarry) & 0xff; + eflags |= CC_C | CC_A; + } else { + eflags &= ~(CC_C | CC_A); + al &= 0x0f; + } + EAX = (EAX & ~0xffff) | al | (ah << 8); + CC_SRC = eflags; +} + +void helper_aas(void) +{ + int icarry; + int al, ah, af; + int eflags; + + eflags = helper_cc_compute_all(CC_OP); + af = eflags & CC_A; + al = EAX & 0xff; + ah = (EAX >> 8) & 0xff; + + icarry = (al < 6); + if (((al & 0x0f) > 9 ) || af) { + al = (al - 6) & 0x0f; + ah = (ah - 1 - icarry) & 0xff; + eflags |= CC_C | CC_A; + } else { + eflags &= ~(CC_C | CC_A); + al &= 0x0f; + } + EAX = (EAX & ~0xffff) | al | (ah << 8); + CC_SRC = eflags; +} + +void helper_daa(void) +{ + int al, af, cf; + int eflags; + + eflags = helper_cc_compute_all(CC_OP); + cf = eflags & CC_C; + af = eflags & CC_A; + al = EAX & 0xff; + + eflags = 0; + if (((al & 0x0f) > 9 ) || af) { + al = (al + 6) & 0xff; + eflags |= CC_A; + } + if ((al > 0x9f) || cf) { + al = (al + 0x60) & 0xff; + eflags |= CC_C; + } + EAX = (EAX & ~0xff) | al; + /* well, speed is not an issue here, so we compute the flags by hand */ + eflags |= (al == 0) << 6; /* zf */ + eflags |= parity_table[al]; /* pf */ + eflags |= (al & 0x80); /* sf */ + CC_SRC = eflags; +} + +void helper_das(void) +{ + int al, al1, af, cf; + int eflags; + + eflags = 
helper_cc_compute_all(CC_OP); + cf = eflags & CC_C; + af = eflags & CC_A; + al = EAX & 0xff; + + eflags = 0; + al1 = al; + if (((al & 0x0f) > 9 ) || af) { + eflags |= CC_A; + if (al < 6 || cf) + eflags |= CC_C; + al = (al - 6) & 0xff; + } + if ((al1 > 0x99) || cf) { + al = (al - 0x60) & 0xff; + eflags |= CC_C; + } + EAX = (EAX & ~0xff) | al; + /* well, speed is not an issue here, so we compute the flags by hand */ + eflags |= (al == 0) << 6; /* zf */ + eflags |= parity_table[al]; /* pf */ + eflags |= (al & 0x80); /* sf */ + CC_SRC = eflags; +} + +void helper_into(int next_eip_addend) +{ + int eflags; + eflags = helper_cc_compute_all(CC_OP); + if (eflags & CC_O) { + raise_interrupt(EXCP04_INTO, 1, 0, next_eip_addend); + } +} + +void helper_cmpxchg8b(target_ulong a0) +{ + uint64_t d; + int eflags; + + eflags = helper_cc_compute_all(CC_OP); + d = ldq(a0); + if (d == (((uint64_t)EDX << 32) | (uint32_t)EAX)) { + stq(a0, ((uint64_t)ECX << 32) | (uint32_t)EBX); + eflags |= CC_Z; + } else { + /* always do the store */ + stq(a0, d); + EDX = (uint32_t)(d >> 32); + EAX = (uint32_t)d; + eflags &= ~CC_Z; + } + CC_SRC = eflags; +} + +#ifdef TARGET_X86_64 +void helper_cmpxchg16b(target_ulong a0) +{ + uint64_t d0, d1; + int eflags; + + if ((a0 & 0xf) != 0) + raise_exception(EXCP0D_GPF); + eflags = helper_cc_compute_all(CC_OP); + d0 = ldq(a0); + d1 = ldq(a0 + 8); + if (d0 == EAX && d1 == EDX) { + stq(a0, EBX); + stq(a0 + 8, ECX); + eflags |= CC_Z; + } else { + /* always do the store */ + stq(a0, d0); + stq(a0 + 8, d1); + EDX = d1; + EAX = d0; + eflags &= ~CC_Z; + } + CC_SRC = eflags; +} +#endif + +void helper_single_step(void) +{ +#ifndef CONFIG_USER_ONLY + check_hw_breakpoints(env, 1); + env->dr[6] |= DR6_BS; +#endif + raise_exception(EXCP01_DB); +} + +void helper_cpuid(void) +{ + uint32_t eax, ebx, ecx, edx; + + helper_svm_check_intercept_param(SVM_EXIT_CPUID, 0); + + cpu_x86_cpuid(env, (uint32_t)EAX, (uint32_t)ECX, &eax, &ebx, &ecx, &edx); + EAX = eax; + EBX = ebx; + ECX = ecx; + EDX = edx; +} + +void helper_enter_level(int level, int data32, target_ulong t1) +{ + target_ulong ssp; + uint32_t esp_mask, esp, ebp; + + esp_mask = get_sp_mask(env->segs[R_SS].flags); + ssp = env->segs[R_SS].base; + ebp = EBP; + esp = ESP; + if (data32) { + /* 32 bit */ + esp -= 4; + while (--level) { + esp -= 4; + ebp -= 4; + stl(ssp + (esp & esp_mask), ldl(ssp + (ebp & esp_mask))); + } + esp -= 4; + stl(ssp + (esp & esp_mask), t1); + } else { + /* 16 bit */ + esp -= 2; + while (--level) { + esp -= 2; + ebp -= 2; + stw(ssp + (esp & esp_mask), lduw(ssp + (ebp & esp_mask))); + } + esp -= 2; + stw(ssp + (esp & esp_mask), t1); + } +} + +#ifdef TARGET_X86_64 +void helper_enter64_level(int level, int data64, target_ulong t1) +{ + target_ulong esp, ebp; + ebp = EBP; + esp = ESP; + + if (data64) { + /* 64 bit */ + esp -= 8; + while (--level) { + esp -= 8; + ebp -= 8; + stq(esp, ldq(ebp)); + } + esp -= 8; + stq(esp, t1); + } else { + /* 16 bit */ + esp -= 2; + while (--level) { + esp -= 2; + ebp -= 2; + stw(esp, lduw(ebp)); + } + esp -= 2; + stw(esp, t1); + } +} +#endif + +void helper_lldt(int selector) +{ + SegmentCache *dt; + uint32_t e1, e2; +#ifndef VBOX + int index, entry_limit; +#else + unsigned int index, entry_limit; +#endif + target_ulong ptr; + +#ifdef VBOX + Log(("helper_lldt_T0: old ldtr=%RTsel {.base=%RGv, .limit=%RGv} new=%RTsel\n", + (RTSEL)env->ldt.selector, (RTGCPTR)env->ldt.base, (RTGCPTR)env->ldt.limit, (RTSEL)(selector & 0xffff))); +#endif + + selector &= 0xffff; + if ((selector & 0xfffc) == 0) { + /* XXX: NULL 
selector case: invalid LDT */ + env->ldt.base = 0; + env->ldt.limit = 0; +#ifdef VBOX + env->ldt.flags = DESC_INTEL_UNUSABLE; + env->ldt.fVBoxFlags = CPUMSELREG_FLAGS_VALID; + env->ldt.newselector = 0; +#endif + } else { + if (selector & 0x4) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + dt = &env->gdt; + index = selector & ~7; +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) + entry_limit = 15; + else +#endif + entry_limit = 7; + if ((index + entry_limit) > dt->limit) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + ptr = dt->base + index; + e1 = ldl_kernel(ptr); + e2 = ldl_kernel(ptr + 4); + if ((e2 & DESC_S_MASK) || ((e2 >> DESC_TYPE_SHIFT) & 0xf) != 2) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, selector & 0xfffc); +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) { + uint32_t e3; + e3 = ldl_kernel(ptr + 8); + load_seg_cache_raw_dt(&env->ldt, e1, e2); + env->ldt.base |= (target_ulong)e3 << 32; + } else +#endif + { + load_seg_cache_raw_dt(&env->ldt, e1, e2); + } + } + env->ldt.selector = selector; +#ifdef VBOX + Log(("helper_lldt_T0: new ldtr=%RTsel {.base=%RGv, .limit=%RGv}\n", + (RTSEL)env->ldt.selector, (RTGCPTR)env->ldt.base, (RTGCPTR)env->ldt.limit)); +#endif +} + +void helper_ltr(int selector) +{ + SegmentCache *dt; + uint32_t e1, e2; +#ifndef VBOX + int index, type, entry_limit; +#else + unsigned int index; + int type, entry_limit; +#endif + target_ulong ptr; + +#ifdef VBOX + Log(("helper_ltr: pc=%RGv old tr=%RTsel {.base=%RGv, .limit=%RGv, .flags=%RX32} new=%RTsel\n", + (RTGCPTR)env->eip, (RTSEL)env->tr.selector, (RTGCPTR)env->tr.base, (RTGCPTR)env->tr.limit, + env->tr.flags, (RTSEL)(selector & 0xffff))); +#endif + selector &= 0xffff; + if ((selector & 0xfffc) == 0) { + /* NULL selector case: invalid TR */ +#ifdef VBOX + raise_exception_err(EXCP0A_TSS, 0); +#else + env->tr.base = 0; + env->tr.limit = 0; + env->tr.flags = 0; +#endif + } else { + if (selector & 0x4) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + dt = &env->gdt; + index = selector & ~7; +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) + entry_limit = 15; + else +#endif + entry_limit = 7; + if ((index + entry_limit) > dt->limit) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + ptr = dt->base + index; + e1 = ldl_kernel(ptr); + e2 = ldl_kernel(ptr + 4); + type = (e2 >> DESC_TYPE_SHIFT) & 0xf; + if ((e2 & DESC_S_MASK) || + (type != 1 && type != 9)) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, selector & 0xfffc); +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) { + uint32_t e3, e4; + e3 = ldl_kernel(ptr + 8); + e4 = ldl_kernel(ptr + 12); + if ((e4 >> DESC_TYPE_SHIFT) & 0xf) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + load_seg_cache_raw_dt(&env->tr, e1, e2); + env->tr.base |= (target_ulong)e3 << 32; + } else +#endif + { + load_seg_cache_raw_dt(&env->tr, e1, e2); + } + env->tr.flags |= DESC_TSS_BUSY_MASK; + e2 |= DESC_TSS_BUSY_MASK; + stl_kernel(ptr + 4, e2); + } + env->tr.selector = selector; +#ifdef VBOX + Log(("helper_ltr: new tr=%RTsel {.base=%RGv, .limit=%RGv, .flags=%RX32} new=%RTsel\n", + (RTSEL)env->tr.selector, (RTGCPTR)env->tr.base, (RTGCPTR)env->tr.limit, + env->tr.flags, (RTSEL)(selector & 0xffff))); +#endif +} + +/* only works if protected mode and not VM86. 
seg_reg must be != R_CS */ +void helper_load_seg(int seg_reg, int selector) +{ + uint32_t e1, e2; + int cpl, dpl, rpl; + SegmentCache *dt; +#ifndef VBOX + int index; +#else + unsigned int index; +#endif + target_ulong ptr; + + selector &= 0xffff; + cpl = env->hflags & HF_CPL_MASK; +#ifdef VBOX + + /* Trying to load a selector with CPL=1? */ + if (cpl == 0 && (selector & 3) == 1 && (env->state & CPU_RAW_RING0)) + { + Log(("RPL 1 -> sel %04X -> %04X (helper_load_seg)\n", selector, selector & 0xfffc)); + selector = selector & 0xfffc; + } +#endif /* VBOX */ + if ((selector & 0xfffc) == 0) { + /* null selector case */ +#ifndef VBOX + if (seg_reg == R_SS +#ifdef TARGET_X86_64 + && (!(env->hflags & HF_CS64_MASK) || cpl == 3) +#endif + ) + raise_exception_err(EXCP0D_GPF, 0); + cpu_x86_load_seg_cache(env, seg_reg, selector, 0, 0, 0); +#else + if (seg_reg == R_SS) { + if (!(env->hflags & HF_CS64_MASK) || cpl == 3) + raise_exception_err(EXCP0D_GPF, 0); + e2 = (cpl << DESC_DPL_SHIFT) | DESC_INTEL_UNUSABLE; + } else { + e2 = DESC_INTEL_UNUSABLE; + } + cpu_x86_load_seg_cache_with_clean_flags(env, seg_reg, selector, 0, 0, e2); +#endif + } else { + + if (selector & 0x4) + dt = &env->ldt; + else + dt = &env->gdt; + index = selector & ~7; + if ((index + 7) > dt->limit) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + ptr = dt->base + index; + e1 = ldl_kernel(ptr); + e2 = ldl_kernel(ptr + 4); + + if (!(e2 & DESC_S_MASK)) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + rpl = selector & 3; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + if (seg_reg == R_SS) { + /* must be writable segment */ + if ((e2 & DESC_CS_MASK) || !(e2 & DESC_W_MASK)) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (rpl != cpl || dpl != cpl) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + } else { + /* must be readable segment */ + if ((e2 & (DESC_CS_MASK | DESC_R_MASK)) == DESC_CS_MASK) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + + if (!(e2 & DESC_CS_MASK) || !(e2 & DESC_C_MASK)) { + /* if not conforming code, test rights */ + if (dpl < cpl || dpl < rpl) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + } + } + + if (!(e2 & DESC_P_MASK)) { + if (seg_reg == R_SS) + raise_exception_err(EXCP0C_STACK, selector & 0xfffc); + else + raise_exception_err(EXCP0B_NOSEG, selector & 0xfffc); + } + + /* set the access bit if not already set */ + if (!(e2 & DESC_A_MASK)) { + e2 |= DESC_A_MASK; + stl_kernel(ptr + 4, e2); + } + + cpu_x86_load_seg_cache(env, seg_reg, selector, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); +#if 0 + qemu_log("load_seg: sel=0x%04x base=0x%08lx limit=0x%08lx flags=%08x\n", + selector, (unsigned long)sc->base, sc->limit, sc->flags); +#endif + } +} + +/* protected mode jump */ +void helper_ljmp_protected(int new_cs, target_ulong new_eip, + int next_eip_addend) +{ + int gate_cs, type; + uint32_t e1, e2, cpl, dpl, rpl, limit; + target_ulong next_eip; + +#ifdef VBOX /** @todo Why do we do this? 
*/ + e1 = e2 = 0; +#endif + if ((new_cs & 0xfffc) == 0) + raise_exception_err(EXCP0D_GPF, 0); + if (load_segment(&e1, &e2, new_cs) != 0) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + cpl = env->hflags & HF_CPL_MASK; + if (e2 & DESC_S_MASK) { + if (!(e2 & DESC_CS_MASK)) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + if (e2 & DESC_C_MASK) { + /* conforming code segment */ + if (dpl > cpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } else { + /* non conforming code segment */ + rpl = new_cs & 3; + if (rpl > cpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + if (dpl != cpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, new_cs & 0xfffc); + limit = get_seg_limit(e1, e2); + if (new_eip > limit && + !(env->hflags & HF_LMA_MASK) && !(e2 & DESC_L_MASK)) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); +#ifdef VBOX + if (!(e2 & DESC_A_MASK)) + e2 = set_segment_accessed(new_cs, e2); +#endif + cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl, + get_seg_base(e1, e2), limit, e2); + EIP = new_eip; + } else { + /* jump to call or task gate */ + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + rpl = new_cs & 3; + cpl = env->hflags & HF_CPL_MASK; + type = (e2 >> DESC_TYPE_SHIFT) & 0xf; + switch(type) { + case 1: /* 286 TSS */ + case 9: /* 386 TSS */ + case 5: /* task gate */ + if (dpl < cpl || dpl < rpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + next_eip = env->eip + next_eip_addend; + switch_tss(new_cs, e1, e2, SWITCH_TSS_JMP, next_eip); + CC_OP = CC_OP_EFLAGS; + break; + case 4: /* 286 call gate */ + case 12: /* 386 call gate */ + if ((dpl < cpl) || (dpl < rpl)) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, new_cs & 0xfffc); + gate_cs = e1 >> 16; + new_eip = (e1 & 0xffff); + if (type == 12) + new_eip |= (e2 & 0xffff0000); + if (load_segment(&e1, &e2, gate_cs) != 0) + raise_exception_err(EXCP0D_GPF, gate_cs & 0xfffc); + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + /* must be code segment */ + if (((e2 & (DESC_S_MASK | DESC_CS_MASK)) != + (DESC_S_MASK | DESC_CS_MASK))) + raise_exception_err(EXCP0D_GPF, gate_cs & 0xfffc); + if (((e2 & DESC_C_MASK) && (dpl > cpl)) || + (!(e2 & DESC_C_MASK) && (dpl != cpl))) + raise_exception_err(EXCP0D_GPF, gate_cs & 0xfffc); + if (!(e2 & DESC_P_MASK)) +#ifdef VBOX /* See page 3-514 of 253666.pdf */ + raise_exception_err(EXCP0B_NOSEG, gate_cs & 0xfffc); +#else + raise_exception_err(EXCP0D_GPF, gate_cs & 0xfffc); +#endif + limit = get_seg_limit(e1, e2); + if (new_eip > limit) + raise_exception_err(EXCP0D_GPF, 0); + cpu_x86_load_seg_cache(env, R_CS, (gate_cs & 0xfffc) | cpl, + get_seg_base(e1, e2), limit, e2); + EIP = new_eip; + break; + default: + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + break; + } + } +} + +/* real mode call */ +void helper_lcall_real(int new_cs, target_ulong new_eip1, + int shift, int next_eip) +{ + int new_eip; + uint32_t esp, esp_mask; + target_ulong ssp; + + new_eip = new_eip1; + esp = ESP; + esp_mask = get_sp_mask(env->segs[R_SS].flags); + ssp = env->segs[R_SS].base; + if (shift) { + PUSHL(ssp, esp, esp_mask, env->segs[R_CS].selector); + PUSHL(ssp, esp, esp_mask, next_eip); + } else { + PUSHW(ssp, esp, esp_mask, env->segs[R_CS].selector); + PUSHW(ssp, esp, esp_mask, next_eip); + } + + SET_ESP(esp, esp_mask); + env->eip = new_eip; + env->segs[R_CS].selector = new_cs; + env->segs[R_CS].base = (new_cs << 4); +} + +/* protected mode call 
*/ +void helper_lcall_protected(int new_cs, target_ulong new_eip, + int shift, int next_eip_addend) +{ + int new_stack, i; + uint32_t e1, e2, cpl, dpl, rpl, selector, offset, param_count; + uint32_t ss = 0, ss_e1 = 0, ss_e2 = 0, sp, type, ss_dpl, sp_mask; + uint32_t val, limit, old_sp_mask; + target_ulong ssp, old_ssp, next_eip; + +#ifdef VBOX /** @todo Why do we do this? */ + e1 = e2 = 0; +#endif + next_eip = env->eip + next_eip_addend; + LOG_PCALL("lcall %04x:%08x s=%d\n", new_cs, (uint32_t)new_eip, shift); + LOG_PCALL_STATE(env); + if ((new_cs & 0xfffc) == 0) + raise_exception_err(EXCP0D_GPF, 0); + if (load_segment(&e1, &e2, new_cs) != 0) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + cpl = env->hflags & HF_CPL_MASK; + LOG_PCALL("desc=%08x:%08x\n", e1, e2); + if (e2 & DESC_S_MASK) { + if (!(e2 & DESC_CS_MASK)) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + if (e2 & DESC_C_MASK) { + /* conforming code segment */ + if (dpl > cpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } else { + /* non conforming code segment */ + rpl = new_cs & 3; + if (rpl > cpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + if (dpl != cpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, new_cs & 0xfffc); +#ifdef VBOX + if (!(e2 & DESC_A_MASK)) + e2 = set_segment_accessed(new_cs, e2); +#endif + +#ifdef TARGET_X86_64 + /* XXX: check 16/32 bit cases in long mode */ + if (shift == 2) { + target_ulong rsp; + /* 64 bit case */ + rsp = ESP; + PUSHQ(rsp, env->segs[R_CS].selector); + PUSHQ(rsp, next_eip); + /* from this point, not restartable */ + ESP = rsp; + cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), e2); + EIP = new_eip; + } else +#endif + { + sp = ESP; + sp_mask = get_sp_mask(env->segs[R_SS].flags); + ssp = env->segs[R_SS].base; + if (shift) { + PUSHL(ssp, sp, sp_mask, env->segs[R_CS].selector); + PUSHL(ssp, sp, sp_mask, next_eip); + } else { + PUSHW(ssp, sp, sp_mask, env->segs[R_CS].selector); + PUSHW(ssp, sp, sp_mask, next_eip); + } + + limit = get_seg_limit(e1, e2); + if (new_eip > limit) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + /* from this point, not restartable */ + SET_ESP(sp, sp_mask); + cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl, + get_seg_base(e1, e2), limit, e2); + EIP = new_eip; + } + } else { + /* check gate type */ + type = (e2 >> DESC_TYPE_SHIFT) & 0x1f; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + rpl = new_cs & 3; + switch(type) { + case 1: /* available 286 TSS */ + case 9: /* available 386 TSS */ + case 5: /* task gate */ + if (dpl < cpl || dpl < rpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + switch_tss(new_cs, e1, e2, SWITCH_TSS_CALL, next_eip); + CC_OP = CC_OP_EFLAGS; + return; + case 4: /* 286 call gate */ + case 12: /* 386 call gate */ + break; + default: + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + break; + } + shift = type >> 3; + + if (dpl < cpl || dpl < rpl) + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + /* check valid bit */ + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, new_cs & 0xfffc); + selector = e1 >> 16; + offset = (e2 & 0xffff0000) | (e1 & 0x0000ffff); + param_count = e2 & 0x1f; + if ((selector & 0xfffc) == 0) + raise_exception_err(EXCP0D_GPF, 0); + + if (load_segment(&e1, &e2, selector) != 0) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (!(e2 & DESC_S_MASK) || !(e2 & (DESC_CS_MASK))) + 
raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + if (dpl > cpl) + raise_exception_err(EXCP0D_GPF, selector & 0xfffc); + if (!(e2 & DESC_P_MASK)) + raise_exception_err(EXCP0B_NOSEG, selector & 0xfffc); + + if (!(e2 & DESC_C_MASK) && dpl < cpl) { + /* to inner privilege */ + get_ss_esp_from_tss(&ss, &sp, dpl); + LOG_PCALL("new ss:esp=%04x:%08x param_count=%d ESP=" TARGET_FMT_lx "\n", + ss, sp, param_count, ESP); + if ((ss & 0xfffc) == 0) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if ((ss & 3) != dpl) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if (load_segment(&ss_e1, &ss_e2, ss) != 0) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + ss_dpl = (ss_e2 >> DESC_DPL_SHIFT) & 3; + if (ss_dpl != dpl) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if (!(ss_e2 & DESC_S_MASK) || + (ss_e2 & DESC_CS_MASK) || + !(ss_e2 & DESC_W_MASK)) + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); + if (!(ss_e2 & DESC_P_MASK)) +#ifdef VBOX /* See page 3-99 of 253666.pdf */ + raise_exception_err(EXCP0C_STACK, ss & 0xfffc); +#else + raise_exception_err(EXCP0A_TSS, ss & 0xfffc); +#endif + + // push_size = ((param_count * 2) + 8) << shift; + + old_sp_mask = get_sp_mask(env->segs[R_SS].flags); + old_ssp = env->segs[R_SS].base; + + sp_mask = get_sp_mask(ss_e2); + ssp = get_seg_base(ss_e1, ss_e2); + if (shift) { + PUSHL(ssp, sp, sp_mask, env->segs[R_SS].selector); + PUSHL(ssp, sp, sp_mask, ESP); + for(i = param_count - 1; i >= 0; i--) { + val = ldl_kernel(old_ssp + ((ESP + i * 4) & old_sp_mask)); + PUSHL(ssp, sp, sp_mask, val); + } + } else { + PUSHW(ssp, sp, sp_mask, env->segs[R_SS].selector); + PUSHW(ssp, sp, sp_mask, ESP); + for(i = param_count - 1; i >= 0; i--) { + val = lduw_kernel(old_ssp + ((ESP + i * 2) & old_sp_mask)); + PUSHW(ssp, sp, sp_mask, val); + } + } + new_stack = 1; + } else { + /* to same privilege */ + sp = ESP; + sp_mask = get_sp_mask(env->segs[R_SS].flags); + ssp = env->segs[R_SS].base; + // push_size = (4 << shift); + new_stack = 0; + } + + if (shift) { + PUSHL(ssp, sp, sp_mask, env->segs[R_CS].selector); + PUSHL(ssp, sp, sp_mask, next_eip); + } else { + PUSHW(ssp, sp, sp_mask, env->segs[R_CS].selector); + PUSHW(ssp, sp, sp_mask, next_eip); + } + + /* from this point, not restartable */ + + if (new_stack) { + ss = (ss & ~3) | dpl; + cpu_x86_load_seg_cache(env, R_SS, ss, + ssp, + get_seg_limit(ss_e1, ss_e2), + ss_e2); + } + + selector = (selector & ~3) | dpl; + cpu_x86_load_seg_cache(env, R_CS, selector, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); + cpu_x86_set_cpl(env, dpl); + SET_ESP(sp, sp_mask); + EIP = offset; + } +} + +/* real and vm86 mode iret */ +void helper_iret_real(int shift) +{ + uint32_t sp, new_cs, new_eip, new_eflags, sp_mask; + target_ulong ssp; + int eflags_mask; +#ifdef VBOX + bool fVME = false; + + remR3TrapClear(env->pVM); +#endif /* VBOX */ + + sp_mask = 0xffff; /* XXXX: use SS segment size ? 
*/ + sp = ESP; + ssp = env->segs[R_SS].base; + if (shift == 1) { + /* 32 bits */ + POPL(ssp, sp, sp_mask, new_eip); + POPL(ssp, sp, sp_mask, new_cs); + new_cs &= 0xffff; + POPL(ssp, sp, sp_mask, new_eflags); + } else { + /* 16 bits */ + POPW(ssp, sp, sp_mask, new_eip); + POPW(ssp, sp, sp_mask, new_cs); + POPW(ssp, sp, sp_mask, new_eflags); + } +#ifdef VBOX + if ( (env->eflags & VM_MASK) + && ((env->eflags >> IOPL_SHIFT) & 3) != 3 + && (env->cr[4] & CR4_VME_MASK)) /* implied or else we would fault earlier */ + { + fVME = true; + /* if virtual interrupt pending and (virtual) interrupts will be enabled -> #GP */ + /* if TF will be set -> #GP */ + if ( ((new_eflags & IF_MASK) && (env->eflags & VIP_MASK)) + || (new_eflags & TF_MASK)) + raise_exception(EXCP0D_GPF); + } +#endif /* VBOX */ + ESP = (ESP & ~sp_mask) | (sp & sp_mask); + env->segs[R_CS].selector = new_cs; + env->segs[R_CS].base = (new_cs << 4); + env->eip = new_eip; +#ifdef VBOX + if (fVME) + eflags_mask = TF_MASK | AC_MASK | ID_MASK | RF_MASK | NT_MASK; + else +#endif + if (env->eflags & VM_MASK) + eflags_mask = TF_MASK | AC_MASK | ID_MASK | IF_MASK | RF_MASK | NT_MASK; + else + eflags_mask = TF_MASK | AC_MASK | ID_MASK | IF_MASK | IOPL_MASK | RF_MASK | NT_MASK; + if (shift == 0) + eflags_mask &= 0xffff; + load_eflags(new_eflags, eflags_mask); + env->hflags2 &= ~HF2_NMI_MASK; +#ifdef VBOX + if (fVME) + { + if (new_eflags & IF_MASK) + env->eflags |= VIF_MASK; + else + env->eflags &= ~VIF_MASK; + } +#endif /* VBOX */ +} + +static inline void validate_seg(int seg_reg, int cpl) +{ + int dpl; + uint32_t e2; + + /* XXX: on x86_64, we do not want to nullify FS and GS because + they may still contain a valid base. I would be interested to + know how a real x86_64 CPU behaves */ + if ((seg_reg == R_FS || seg_reg == R_GS) && + (env->segs[seg_reg].selector & 0xfffc) == 0) + return; + + e2 = env->segs[seg_reg].flags; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + if (!(e2 & DESC_CS_MASK) || !(e2 & DESC_C_MASK)) { + /* data or non conforming code segment */ + if (dpl < cpl) { + cpu_x86_load_seg_cache(env, seg_reg, 0, 0, 0, 0); + } + } +} + +/* protected mode iret */ +static inline void helper_ret_protected(int shift, int is_iret, int addend) +{ + uint32_t new_cs, new_eflags, new_ss; + uint32_t new_es, new_ds, new_fs, new_gs; + uint32_t e1, e2, ss_e1, ss_e2; + int cpl, dpl, rpl, eflags_mask, iopl; + target_ulong ssp, sp, new_eip, new_esp, sp_mask; + +#ifdef VBOX /** @todo Why do we do this? 
*/ + ss_e1 = ss_e2 = e1 = e2 = 0; +#endif + +#ifdef TARGET_X86_64 + if (shift == 2) + sp_mask = -1; + else +#endif + sp_mask = get_sp_mask(env->segs[R_SS].flags); + sp = ESP; + ssp = env->segs[R_SS].base; + new_eflags = 0; /* avoid warning */ +#ifdef TARGET_X86_64 + if (shift == 2) { + POPQ(sp, new_eip); + POPQ(sp, new_cs); + new_cs &= 0xffff; + if (is_iret) { + POPQ(sp, new_eflags); + } + } else +#endif + if (shift == 1) { + /* 32 bits */ + POPL(ssp, sp, sp_mask, new_eip); + POPL(ssp, sp, sp_mask, new_cs); + new_cs &= 0xffff; + if (is_iret) { + POPL(ssp, sp, sp_mask, new_eflags); +#define LOG_GROUP LOG_GROUP_REM +#if defined(VBOX) && defined(DEBUG) + Log(("iret: new CS %04X (old=%x)\n", new_cs, env->segs[R_CS].selector)); + Log(("iret: new EIP %08X\n", (uint32_t)new_eip)); + Log(("iret: new EFLAGS %08X\n", new_eflags)); + Log(("iret: EAX=%08x\n", (uint32_t)EAX)); +#endif + if (new_eflags & VM_MASK) + goto return_to_vm86; + } +#ifdef VBOX + if ((new_cs & 0x3) == 1 && (env->state & CPU_RAW_RING0)) + { + if ( !EMIsRawRing1Enabled(env->pVM) + || env->segs[R_CS].selector == (new_cs & 0xfffc)) + { + Log(("RPL 1 -> new_cs %04X -> %04X\n", new_cs, new_cs & 0xfffc)); + new_cs = new_cs & 0xfffc; + } + else + { + /* Ugly assumption: assume a genuine switch to ring-1. */ + Log(("Genuine switch to ring-1 (iret)\n")); + } + } + else if ((new_cs & 0x3) == 2 && (env->state & CPU_RAW_RING0) && EMIsRawRing1Enabled(env->pVM)) + { + Log(("RPL 2 -> new_cs %04X -> %04X\n", new_cs, (new_cs & 0xfffc) | 1)); + new_cs = (new_cs & 0xfffc) | 1; + } +#endif + } else { + /* 16 bits */ + POPW(ssp, sp, sp_mask, new_eip); + POPW(ssp, sp, sp_mask, new_cs); + if (is_iret) + POPW(ssp, sp, sp_mask, new_eflags); + } + LOG_PCALL("lret new %04x:" TARGET_FMT_lx " s=%d addend=0x%x\n", + new_cs, new_eip, shift, addend); + LOG_PCALL_STATE(env); + if ((new_cs & 0xfffc) == 0) + { +#if defined(VBOX) && defined(DEBUG) + Log(("new_cs & 0xfffc) == 0\n")); +#endif + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + if (load_segment(&e1, &e2, new_cs) != 0) + { +#if defined(VBOX) && defined(DEBUG) + Log(("load_segment failed\n")); +#endif + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + if (!(e2 & DESC_S_MASK) || + !(e2 & DESC_CS_MASK)) + { +#if defined(VBOX) && defined(DEBUG) + Log(("e2 mask %08x\n", e2)); +#endif + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + cpl = env->hflags & HF_CPL_MASK; + rpl = new_cs & 3; + if (rpl < cpl) + { +#if defined(VBOX) && defined(DEBUG) + Log(("rpl < cpl (%d vs %d)\n", rpl, cpl)); +#endif + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + + if (e2 & DESC_C_MASK) { + if (dpl > rpl) + { +#if defined(VBOX) && defined(DEBUG) + Log(("dpl > rpl (%d vs %d)\n", dpl, rpl)); +#endif + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + } else { + if (dpl != rpl) + { +#if defined(VBOX) && defined(DEBUG) + Log(("dpl != rpl (%d vs %d) e1=%x e2=%x\n", dpl, rpl, e1, e2)); +#endif + raise_exception_err(EXCP0D_GPF, new_cs & 0xfffc); + } + } + if (!(e2 & DESC_P_MASK)) + { +#if defined(VBOX) && defined(DEBUG) + Log(("DESC_P_MASK e2=%08x\n", e2)); +#endif + raise_exception_err(EXCP0B_NOSEG, new_cs & 0xfffc); + } + + sp += addend; + if (rpl == cpl && (!(env->hflags & HF_CS64_MASK) || + ((env->hflags & HF_CS64_MASK) && !is_iret))) { + /* return to same privilege level */ +#ifdef VBOX + if (!(e2 & DESC_A_MASK)) + e2 = set_segment_accessed(new_cs, e2); +#endif + cpu_x86_load_seg_cache(env, R_CS, new_cs, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); 
+ } else { + /* return to different privilege level */ +#ifdef TARGET_X86_64 + if (shift == 2) { + POPQ(sp, new_esp); + POPQ(sp, new_ss); + new_ss &= 0xffff; + } else +#endif + if (shift == 1) { + /* 32 bits */ + POPL(ssp, sp, sp_mask, new_esp); + POPL(ssp, sp, sp_mask, new_ss); + new_ss &= 0xffff; + } else { + /* 16 bits */ + POPW(ssp, sp, sp_mask, new_esp); + POPW(ssp, sp, sp_mask, new_ss); + } + LOG_PCALL("new ss:esp=%04x:" TARGET_FMT_lx "\n", + new_ss, new_esp); + if ((new_ss & 0xfffc) == 0) { +#ifdef TARGET_X86_64 + /* NULL ss is allowed in long mode if cpl != 3*/ +# ifndef VBOX + /* XXX: test CS64 ? */ + if ((env->hflags & HF_LMA_MASK) && rpl != 3) { + cpu_x86_load_seg_cache(env, R_SS, new_ss, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (rpl << DESC_DPL_SHIFT) | + DESC_W_MASK | DESC_A_MASK); + ss_e2 = DESC_B_MASK; /* XXX: should not be needed ? */ + } else +# else /* VBOX */ + if ((env->hflags & HF_LMA_MASK) && rpl != 3 && (e2 & DESC_L_MASK)) { + if (!(e2 & DESC_A_MASK)) + e2 = set_segment_accessed(new_cs, e2); + cpu_x86_load_seg_cache_with_clean_flags(env, R_SS, new_ss, + 0, 0xffffffff, + DESC_INTEL_UNUSABLE | (rpl << DESC_DPL_SHIFT) ); + ss_e2 = DESC_B_MASK; /* not really used */ + } else +# endif +#endif + { +#if defined(VBOX) && defined(DEBUG) + Log(("NULL ss, rpl=%d\n", rpl)); +#endif + raise_exception_err(EXCP0D_GPF, 0); + } + } else { + if ((new_ss & 3) != rpl) + { +#if defined(VBOX) && defined(DEBUG) + Log(("new_ss=%x != rpl=%d\n", new_ss, rpl)); +#endif + raise_exception_err(EXCP0D_GPF, new_ss & 0xfffc); + } + if (load_segment(&ss_e1, &ss_e2, new_ss) != 0) + { +#if defined(VBOX) && defined(DEBUG) + Log(("new_ss=%x load error\n", new_ss)); +#endif + raise_exception_err(EXCP0D_GPF, new_ss & 0xfffc); + } + if (!(ss_e2 & DESC_S_MASK) || + (ss_e2 & DESC_CS_MASK) || + !(ss_e2 & DESC_W_MASK)) + { +#if defined(VBOX) && defined(DEBUG) + Log(("new_ss=%x ss_e2=%#x bad type\n", new_ss, ss_e2)); +#endif + raise_exception_err(EXCP0D_GPF, new_ss & 0xfffc); + } + dpl = (ss_e2 >> DESC_DPL_SHIFT) & 3; + if (dpl != rpl) + { +#if defined(VBOX) && defined(DEBUG) + Log(("SS.dpl=%u != rpl=%u\n", dpl, rpl)); +#endif + raise_exception_err(EXCP0D_GPF, new_ss & 0xfffc); + } + if (!(ss_e2 & DESC_P_MASK)) + { +#if defined(VBOX) && defined(DEBUG) + Log(("new_ss=%#x #NP\n", new_ss)); +#endif + raise_exception_err(EXCP0B_NOSEG, new_ss & 0xfffc); + } +#ifdef VBOX + if (!(e2 & DESC_A_MASK)) + e2 = set_segment_accessed(new_cs, e2); + if (!(ss_e2 & DESC_A_MASK)) + ss_e2 = set_segment_accessed(new_ss, ss_e2); +#endif + cpu_x86_load_seg_cache(env, R_SS, new_ss, + get_seg_base(ss_e1, ss_e2), + get_seg_limit(ss_e1, ss_e2), + ss_e2); + } + + cpu_x86_load_seg_cache(env, R_CS, new_cs, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); + cpu_x86_set_cpl(env, rpl); + sp = new_esp; +#ifdef TARGET_X86_64 + if (env->hflags & HF_CS64_MASK) + sp_mask = -1; + else +#endif + sp_mask = get_sp_mask(ss_e2); + + /* validate data segments */ + validate_seg(R_ES, rpl); + validate_seg(R_DS, rpl); + validate_seg(R_FS, rpl); + validate_seg(R_GS, rpl); + + sp += addend; + } + SET_ESP(sp, sp_mask); + env->eip = new_eip; + if (is_iret) { + /* NOTE: 'cpl' is the _old_ CPL */ + eflags_mask = TF_MASK | AC_MASK | ID_MASK | RF_MASK | NT_MASK; + if (cpl == 0) +#ifdef VBOX + eflags_mask |= IOPL_MASK | VIF_MASK | VIP_MASK; +#else + eflags_mask |= IOPL_MASK; +#endif + iopl = (env->eflags >> IOPL_SHIFT) & 3; + if (cpl <= iopl) + eflags_mask |= IF_MASK; + if (shift == 0) + eflags_mask &= 0xffff; + 
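+        /* eflags_mask now holds exactly the bits iret may change here:
+           TF/AC/ID/RF/NT always, IOPL (plus VIF/VIP in the VBox build) only
+           when the old CPL is 0, IF only when CPL <= IOPL, and just the low
+           16 bits for a 16-bit operand size. */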
load_eflags(new_eflags, eflags_mask); + } + return; + + return_to_vm86: + POPL(ssp, sp, sp_mask, new_esp); + POPL(ssp, sp, sp_mask, new_ss); + POPL(ssp, sp, sp_mask, new_es); + POPL(ssp, sp, sp_mask, new_ds); + POPL(ssp, sp, sp_mask, new_fs); + POPL(ssp, sp, sp_mask, new_gs); + + /* modify processor state */ + load_eflags(new_eflags, TF_MASK | AC_MASK | ID_MASK | + IF_MASK | IOPL_MASK | VM_MASK | NT_MASK | VIF_MASK | VIP_MASK); + load_seg_vm(R_CS, new_cs & 0xffff); + cpu_x86_set_cpl(env, 3); + load_seg_vm(R_SS, new_ss & 0xffff); + load_seg_vm(R_ES, new_es & 0xffff); + load_seg_vm(R_DS, new_ds & 0xffff); + load_seg_vm(R_FS, new_fs & 0xffff); + load_seg_vm(R_GS, new_gs & 0xffff); + + env->eip = new_eip & 0xffff; + ESP = new_esp; +} + +void helper_iret_protected(int shift, int next_eip) +{ + int tss_selector, type; + uint32_t e1, e2; + +#ifdef VBOX + Log(("iret (shift=%d new_eip=%#x)\n", shift, next_eip)); + e1 = e2 = 0; /** @todo Why do we do this? */ + remR3TrapClear(env->pVM); +#endif + + /* specific case for TSS */ + if (env->eflags & NT_MASK) { +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) + { +#if defined(VBOX) && defined(DEBUG) + Log(("eflags.NT=1 on iret in long mode\n")); +#endif + raise_exception_err(EXCP0D_GPF, 0); + } +#endif + tss_selector = lduw_kernel(env->tr.base + 0); + if (tss_selector & 4) + raise_exception_err(EXCP0A_TSS, tss_selector & 0xfffc); + if (load_segment(&e1, &e2, tss_selector) != 0) + raise_exception_err(EXCP0A_TSS, tss_selector & 0xfffc); + type = (e2 >> DESC_TYPE_SHIFT) & 0x17; + /* NOTE: we check both segment and busy TSS */ + if (type != 3) + raise_exception_err(EXCP0A_TSS, tss_selector & 0xfffc); + switch_tss(tss_selector, e1, e2, SWITCH_TSS_IRET, next_eip); + } else { + helper_ret_protected(shift, 1, 0); + } + env->hflags2 &= ~HF2_NMI_MASK; +} + +void helper_lret_protected(int shift, int addend) +{ + helper_ret_protected(shift, 0, addend); +} + +void helper_sysenter(void) +{ + if (env->sysenter_cs == 0) { + raise_exception_err(EXCP0D_GPF, 0); + } + env->eflags &= ~(VM_MASK | IF_MASK | RF_MASK); + cpu_x86_set_cpl(env, 0); + +#ifdef TARGET_X86_64 + if (env->hflags & HF_LMA_MASK) { + cpu_x86_load_seg_cache(env, R_CS, env->sysenter_cs & 0xfffc, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK | DESC_L_MASK); + } else +#endif + { + cpu_x86_load_seg_cache(env, R_CS, env->sysenter_cs & 0xfffc, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK); + } + cpu_x86_load_seg_cache(env, R_SS, (env->sysenter_cs + 8) & 0xfffc, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | + DESC_W_MASK | DESC_A_MASK); + ESP = env->sysenter_esp; + EIP = env->sysenter_eip; +} + +void helper_sysexit(int dflag) +{ + int cpl; + + cpl = env->hflags & HF_CPL_MASK; + if (env->sysenter_cs == 0 || cpl != 0) { + raise_exception_err(EXCP0D_GPF, 0); + } + cpu_x86_set_cpl(env, 3); +#ifdef TARGET_X86_64 + if (dflag == 2) { + cpu_x86_load_seg_cache(env, R_CS, ((env->sysenter_cs + 32) & 0xfffc) | 3, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK | DESC_L_MASK); + cpu_x86_load_seg_cache(env, R_SS, ((env->sysenter_cs + 40) & 0xfffc) | 3, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_W_MASK | DESC_A_MASK); + } else +#endif + { + cpu_x86_load_seg_cache(env, R_CS, 
((env->sysenter_cs + 16) & 0xfffc) | 3, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_CS_MASK | DESC_R_MASK | DESC_A_MASK); + cpu_x86_load_seg_cache(env, R_SS, ((env->sysenter_cs + 24) & 0xfffc) | 3, + 0, 0xffffffff, + DESC_G_MASK | DESC_B_MASK | DESC_P_MASK | + DESC_S_MASK | (3 << DESC_DPL_SHIFT) | + DESC_W_MASK | DESC_A_MASK); + } + ESP = ECX; + EIP = EDX; +} + +#if defined(CONFIG_USER_ONLY) +target_ulong helper_read_crN(int reg) +{ + return 0; +} + +void helper_write_crN(int reg, target_ulong t0) +{ +} + +void helper_movl_drN_T0(int reg, target_ulong t0) +{ +} +#else +target_ulong helper_read_crN(int reg) +{ + target_ulong val; + + helper_svm_check_intercept_param(SVM_EXIT_READ_CR0 + reg, 0); + switch(reg) { + default: + val = env->cr[reg]; + break; + case 8: + if (!(env->hflags2 & HF2_VINTR_MASK)) { +#ifndef VBOX + val = cpu_get_apic_tpr(env->apic_state); +#else /* VBOX */ + val = cpu_get_apic_tpr(env); +#endif /* VBOX */ + } else { + val = env->v_tpr; + } + break; + } + return val; +} + +void helper_write_crN(int reg, target_ulong t0) +{ + helper_svm_check_intercept_param(SVM_EXIT_WRITE_CR0 + reg, 0); + switch(reg) { + case 0: + cpu_x86_update_cr0(env, t0); + break; + case 3: + cpu_x86_update_cr3(env, t0); + break; + case 4: + cpu_x86_update_cr4(env, t0); + break; + case 8: + if (!(env->hflags2 & HF2_VINTR_MASK)) { +#ifndef VBOX + cpu_set_apic_tpr(env->apic_state, t0); +#else /* VBOX */ + cpu_set_apic_tpr(env, t0); +#endif /* VBOX */ + } + env->v_tpr = t0 & 0x0f; + break; + default: + env->cr[reg] = t0; + break; + } +} + +void helper_movl_drN_T0(int reg, target_ulong t0) +{ + int i; + + if (reg < 4) { + hw_breakpoint_remove(env, reg); + env->dr[reg] = t0; + hw_breakpoint_insert(env, reg); +# ifndef VBOX + } else if (reg == 7) { +# else + } else if (reg == 7 || reg == 5) { /* (DR5 is an alias for DR7.) */ + if (t0 & X86_DR7_MBZ_MASK) + raise_exception_err(EXCP0D_GPF, 0); + t0 |= X86_DR7_RA1_MASK; + t0 &= ~X86_DR7_RAZ_MASK; +# endif + for (i = 0; i < 4; i++) + hw_breakpoint_remove(env, i); + env->dr[7] = t0; + for (i = 0; i < 4; i++) + hw_breakpoint_insert(env, i); + } else { +# ifndef VBOX + env->dr[reg] = t0; +# else + if (t0 & X86_DR6_MBZ_MASK) + raise_exception_err(EXCP0D_GPF, 0); + t0 |= X86_DR6_RA1_MASK; + t0 &= ~X86_DR6_RAZ_MASK; + env->dr[6] = t0; /* (DR4 is an alias for DR6.) */ +# endif + } +} +#endif + +void helper_lmsw(target_ulong t0) +{ + /* only 4 lower bits of CR0 are modified. PE cannot be set to zero + if already set to one. */ + t0 = (env->cr[0] & ~0xe) | (t0 & 0xf); + helper_write_crN(0, t0); +} + +void helper_clts(void) +{ + env->cr[0] &= ~CR0_TS_MASK; + env->hflags &= ~HF_TS_MASK; +} + +void helper_invlpg(target_ulong addr) +{ + helper_svm_check_intercept_param(SVM_EXIT_INVLPG, 0); + tlb_flush_page(env, addr); +} + +void helper_rdtsc(void) +{ + uint64_t val; + + if ((env->cr[4] & CR4_TSD_MASK) && ((env->hflags & HF_CPL_MASK) != 0)) { + raise_exception(EXCP0D_GPF); + } + helper_svm_check_intercept_param(SVM_EXIT_RDTSC, 0); + + val = cpu_get_tsc(env) + env->tsc_offset; + EAX = (uint32_t)(val); + EDX = (uint32_t)(val >> 32); +} + +void helper_rdtscp(void) +{ + helper_rdtsc(); +#ifndef VBOX + ECX = (uint32_t)(env->tsc_aux); +#else /* VBOX */ + uint64_t val; + if (cpu_rdmsr(env, MSR_K8_TSC_AUX, &val) == 0) + ECX = (uint32_t)(val); + else + ECX = 0; +#endif /* VBOX */ +} + +void helper_rdpmc(void) +{ +#ifdef VBOX + /* If X86_CR4_PCE is *not* set, then CPL must be zero. 
*/ + if (!(env->cr[4] & CR4_PCE_MASK) && ((env->hflags & HF_CPL_MASK) != 0)) { + raise_exception(EXCP0D_GPF); + } + /* Just return zero here; rather tricky to properly emulate this, especially as the specs are a mess. */ + EAX = 0; + EDX = 0; +#else /* !VBOX */ + if ((env->cr[4] & CR4_PCE_MASK) && ((env->hflags & HF_CPL_MASK) != 0)) { + raise_exception(EXCP0D_GPF); + } + helper_svm_check_intercept_param(SVM_EXIT_RDPMC, 0); + + /* currently unimplemented */ + raise_exception_err(EXCP06_ILLOP, 0); +#endif /* !VBOX */ +} + +#if defined(CONFIG_USER_ONLY) +void helper_wrmsr(void) +{ +} + +void helper_rdmsr(void) +{ +} +#else +void helper_wrmsr(void) +{ + uint64_t val; + + helper_svm_check_intercept_param(SVM_EXIT_MSR, 1); + + val = ((uint32_t)EAX) | ((uint64_t)((uint32_t)EDX) << 32); + + switch((uint32_t)ECX) { + case MSR_IA32_SYSENTER_CS: + env->sysenter_cs = val & 0xffff; + break; + case MSR_IA32_SYSENTER_ESP: + env->sysenter_esp = val; + break; + case MSR_IA32_SYSENTER_EIP: + env->sysenter_eip = val; + break; + case MSR_IA32_APICBASE: +# ifndef VBOX /* The CPUMSetGuestMsr call below does this now. */ + cpu_set_apic_base(env->apic_state, val); +# endif + break; + case MSR_EFER: + { + uint64_t update_mask; + update_mask = 0; + if (env->cpuid_ext2_features & CPUID_EXT2_SYSCALL) + update_mask |= MSR_EFER_SCE; + if (env->cpuid_ext2_features & CPUID_EXT2_LM) + update_mask |= MSR_EFER_LME; + if (env->cpuid_ext2_features & CPUID_EXT2_FFXSR) + update_mask |= MSR_EFER_FFXSR; + if (env->cpuid_ext2_features & CPUID_EXT2_NX) + update_mask |= MSR_EFER_NXE; + if (env->cpuid_ext3_features & CPUID_EXT3_SVM) + update_mask |= MSR_EFER_SVME; + if (env->cpuid_ext2_features & CPUID_EXT2_FFXSR) + update_mask |= MSR_EFER_FFXSR; + cpu_load_efer(env, (env->efer & ~update_mask) | + (val & update_mask)); + } + break; + case MSR_STAR: + env->star = val; + break; + case MSR_PAT: + env->pat = val; + break; + case MSR_VM_HSAVE_PA: + env->vm_hsave = val; + break; +#ifdef TARGET_X86_64 + case MSR_LSTAR: + env->lstar = val; + break; + case MSR_CSTAR: + env->cstar = val; + break; + case MSR_FMASK: + env->fmask = val; + break; + case MSR_FSBASE: + env->segs[R_FS].base = val; + break; + case MSR_GSBASE: + env->segs[R_GS].base = val; + break; + case MSR_KERNELGSBASE: + env->kernelgsbase = val; + break; +#endif +# ifndef VBOX + case MSR_MTRRphysBase(0): + case MSR_MTRRphysBase(1): + case MSR_MTRRphysBase(2): + case MSR_MTRRphysBase(3): + case MSR_MTRRphysBase(4): + case MSR_MTRRphysBase(5): + case MSR_MTRRphysBase(6): + case MSR_MTRRphysBase(7): + env->mtrr_var[((uint32_t)ECX - MSR_MTRRphysBase(0)) / 2].base = val; + break; + case MSR_MTRRphysMask(0): + case MSR_MTRRphysMask(1): + case MSR_MTRRphysMask(2): + case MSR_MTRRphysMask(3): + case MSR_MTRRphysMask(4): + case MSR_MTRRphysMask(5): + case MSR_MTRRphysMask(6): + case MSR_MTRRphysMask(7): + env->mtrr_var[((uint32_t)ECX - MSR_MTRRphysMask(0)) / 2].mask = val; + break; + case MSR_MTRRfix64K_00000: + env->mtrr_fixed[(uint32_t)ECX - MSR_MTRRfix64K_00000] = val; + break; + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + env->mtrr_fixed[(uint32_t)ECX - MSR_MTRRfix16K_80000 + 1] = val; + break; + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + env->mtrr_fixed[(uint32_t)ECX - MSR_MTRRfix4K_C0000 + 3] = val; + break; + case MSR_MTRRdefType: + env->mtrr_deftype = val; + break; + case MSR_MCG_STATUS: + 
env->mcg_status = val; + break; + case MSR_MCG_CTL: + if ((env->mcg_cap & MCG_CTL_P) + && (val == 0 || val == ~(uint64_t)0)) + env->mcg_ctl = val; + break; + case MSR_TSC_AUX: + env->tsc_aux = val; + break; +# endif /* !VBOX */ + default: +# ifndef VBOX + if ((uint32_t)ECX >= MSR_MC0_CTL + && (uint32_t)ECX < MSR_MC0_CTL + (4 * env->mcg_cap & 0xff)) { + uint32_t offset = (uint32_t)ECX - MSR_MC0_CTL; + if ((offset & 0x3) != 0 + || (val == 0 || val == ~(uint64_t)0)) + env->mce_banks[offset] = val; + break; + } + /* XXX: exception ? */ +# endif + break; + } + +# ifdef VBOX + /* call CPUM. */ + if (cpu_wrmsr(env, (uint32_t)ECX, val) != 0) + { + /** @todo be a brave man and raise a \#GP(0) here as we should... */ + } +# endif +} + +void helper_rdmsr(void) +{ + uint64_t val; + + helper_svm_check_intercept_param(SVM_EXIT_MSR, 0); + + switch((uint32_t)ECX) { + case MSR_IA32_SYSENTER_CS: + val = env->sysenter_cs; + break; + case MSR_IA32_SYSENTER_ESP: + val = env->sysenter_esp; + break; + case MSR_IA32_SYSENTER_EIP: + val = env->sysenter_eip; + break; + case MSR_IA32_APICBASE: +#ifndef VBOX + val = cpu_get_apic_base(env->apic_state); +#else /* VBOX */ + val = cpu_get_apic_base(env); +#endif /* VBOX */ + break; + case MSR_EFER: + val = env->efer; + break; + case MSR_STAR: + val = env->star; + break; + case MSR_PAT: + val = env->pat; + break; + case MSR_VM_HSAVE_PA: + val = env->vm_hsave; + break; +# ifndef VBOX /* forward to CPUMQueryGuestMsr. */ + case MSR_IA32_PERF_STATUS: + /* tsc_increment_by_tick */ + val = 1000ULL; + /* CPU multiplier */ + val |= (((uint64_t)4ULL) << 40); + break; +# endif /* !VBOX */ +#ifdef TARGET_X86_64 + case MSR_LSTAR: + val = env->lstar; + break; + case MSR_CSTAR: + val = env->cstar; + break; + case MSR_FMASK: + val = env->fmask; + break; + case MSR_FSBASE: + val = env->segs[R_FS].base; + break; + case MSR_GSBASE: + val = env->segs[R_GS].base; + break; + case MSR_KERNELGSBASE: + val = env->kernelgsbase; + break; +# ifndef VBOX + case MSR_TSC_AUX: + val = env->tsc_aux; + break; +# endif /*!VBOX*/ +#endif +# ifndef VBOX + case MSR_MTRRphysBase(0): + case MSR_MTRRphysBase(1): + case MSR_MTRRphysBase(2): + case MSR_MTRRphysBase(3): + case MSR_MTRRphysBase(4): + case MSR_MTRRphysBase(5): + case MSR_MTRRphysBase(6): + case MSR_MTRRphysBase(7): + val = env->mtrr_var[((uint32_t)ECX - MSR_MTRRphysBase(0)) / 2].base; + break; + case MSR_MTRRphysMask(0): + case MSR_MTRRphysMask(1): + case MSR_MTRRphysMask(2): + case MSR_MTRRphysMask(3): + case MSR_MTRRphysMask(4): + case MSR_MTRRphysMask(5): + case MSR_MTRRphysMask(6): + case MSR_MTRRphysMask(7): + val = env->mtrr_var[((uint32_t)ECX - MSR_MTRRphysMask(0)) / 2].mask; + break; + case MSR_MTRRfix64K_00000: + val = env->mtrr_fixed[0]; + break; + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + val = env->mtrr_fixed[(uint32_t)ECX - MSR_MTRRfix16K_80000 + 1]; + break; + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + val = env->mtrr_fixed[(uint32_t)ECX - MSR_MTRRfix4K_C0000 + 3]; + break; + case MSR_MTRRdefType: + val = env->mtrr_deftype; + break; + case MSR_MTRRcap: + if (env->cpuid_features & CPUID_MTRR) + val = MSR_MTRRcap_VCNT | MSR_MTRRcap_FIXRANGE_SUPPORT | MSR_MTRRcap_WC_SUPPORTED; + else + /* XXX: exception ? 
*/ + val = 0; + break; + case MSR_MCG_CAP: + val = env->mcg_cap; + break; + case MSR_MCG_CTL: + if (env->mcg_cap & MCG_CTL_P) + val = env->mcg_ctl; + else + val = 0; + break; + case MSR_MCG_STATUS: + val = env->mcg_status; + break; +# endif /* !VBOX */ + default: +# ifndef VBOX + if ((uint32_t)ECX >= MSR_MC0_CTL + && (uint32_t)ECX < MSR_MC0_CTL + (4 * env->mcg_cap & 0xff)) { + uint32_t offset = (uint32_t)ECX - MSR_MC0_CTL; + val = env->mce_banks[offset]; + break; + } + /* XXX: exception ? */ + val = 0; +# else /* VBOX */ + if (cpu_rdmsr(env, (uint32_t)ECX, &val) != 0) + { + /** @todo be a brave man and raise a \#GP(0) here as we should... */ + val = 0; + } +# endif /* VBOX */ + break; + } + EAX = (uint32_t)(val); + EDX = (uint32_t)(val >> 32); + +# ifdef VBOX_STRICT + if ((uint32_t)ECX != MSR_IA32_TSC) { + if (cpu_rdmsr(env, (uint32_t)ECX, &val) != 0) + val = 0; + AssertMsg(val == RT_MAKE_U64(EAX, EDX), ("idMsr=%#x val=%#llx eax:edx=%#llx\n", (uint32_t)ECX, val, RT_MAKE_U64(EAX, EDX))); + } +# endif +} +#endif + +target_ulong helper_lsl(target_ulong selector1) +{ + unsigned int limit; + uint32_t e1, e2, eflags, selector; + int rpl, dpl, cpl, type; + + selector = selector1 & 0xffff; + eflags = helper_cc_compute_all(CC_OP); + if ((selector & 0xfffc) == 0) + goto fail; + if (load_segment(&e1, &e2, selector) != 0) + goto fail; + rpl = selector & 3; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + if (e2 & DESC_S_MASK) { + if ((e2 & DESC_CS_MASK) && (e2 & DESC_C_MASK)) { + /* conforming */ + } else { + if (dpl < cpl || dpl < rpl) + goto fail; + } + } else { + type = (e2 >> DESC_TYPE_SHIFT) & 0xf; + switch(type) { + case 1: + case 2: + case 3: + case 9: + case 11: + break; + default: + goto fail; + } + if (dpl < cpl || dpl < rpl) { + fail: + CC_SRC = eflags & ~CC_Z; + return 0; + } + } + limit = get_seg_limit(e1, e2); + CC_SRC = eflags | CC_Z; + return limit; +} + +target_ulong helper_lar(target_ulong selector1) +{ + uint32_t e1, e2, eflags, selector; + int rpl, dpl, cpl, type; + + selector = selector1 & 0xffff; + eflags = helper_cc_compute_all(CC_OP); + if ((selector & 0xfffc) == 0) + goto fail; + if (load_segment(&e1, &e2, selector) != 0) + goto fail; + rpl = selector & 3; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + if (e2 & DESC_S_MASK) { + if ((e2 & DESC_CS_MASK) && (e2 & DESC_C_MASK)) { + /* conforming */ + } else { + if (dpl < cpl || dpl < rpl) + goto fail; + } + } else { + type = (e2 >> DESC_TYPE_SHIFT) & 0xf; + switch(type) { + case 1: + case 2: + case 3: + case 4: + case 5: + case 9: + case 11: + case 12: + break; + default: + goto fail; + } + if (dpl < cpl || dpl < rpl) { + fail: + CC_SRC = eflags & ~CC_Z; + return 0; + } + } + CC_SRC = eflags | CC_Z; +#ifdef VBOX /* AMD says 0x00ffff00, while intel says 0x00fxff00. Bochs and IEM does like AMD says (x=f). 
*/ + return e2 & 0x00ffff00; +#else + return e2 & 0x00f0ff00; +#endif +} + +void helper_verr(target_ulong selector1) +{ + uint32_t e1, e2, eflags, selector; + int rpl, dpl, cpl; + + selector = selector1 & 0xffff; + eflags = helper_cc_compute_all(CC_OP); + if ((selector & 0xfffc) == 0) + goto fail; + if (load_segment(&e1, &e2, selector) != 0) + goto fail; + if (!(e2 & DESC_S_MASK)) + goto fail; + rpl = selector & 3; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + if (e2 & DESC_CS_MASK) { + if (!(e2 & DESC_R_MASK)) + goto fail; + if (!(e2 & DESC_C_MASK)) { + if (dpl < cpl || dpl < rpl) + goto fail; + } + } else { + if (dpl < cpl || dpl < rpl) { + fail: + CC_SRC = eflags & ~CC_Z; + return; + } + } + CC_SRC = eflags | CC_Z; +} + +void helper_verw(target_ulong selector1) +{ + uint32_t e1, e2, eflags, selector; + int rpl, dpl, cpl; + + selector = selector1 & 0xffff; + eflags = helper_cc_compute_all(CC_OP); + if ((selector & 0xfffc) == 0) + goto fail; + if (load_segment(&e1, &e2, selector) != 0) + goto fail; + if (!(e2 & DESC_S_MASK)) + goto fail; + rpl = selector & 3; + dpl = (e2 >> DESC_DPL_SHIFT) & 3; + cpl = env->hflags & HF_CPL_MASK; + if (e2 & DESC_CS_MASK) { + goto fail; + } else { + if (dpl < cpl || dpl < rpl) + goto fail; + if (!(e2 & DESC_W_MASK)) { + fail: + CC_SRC = eflags & ~CC_Z; + return; + } + } + CC_SRC = eflags | CC_Z; +} + +/* x87 FPU helpers */ + +static void fpu_set_exception(int mask) +{ + env->fpus |= mask; + if (env->fpus & (~env->fpuc & FPUC_EM)) + env->fpus |= FPUS_SE | FPUS_B; +} + +static inline CPU86_LDouble helper_fdiv(CPU86_LDouble a, CPU86_LDouble b) +{ + if (b == 0.0) + fpu_set_exception(FPUS_ZE); + return a / b; +} + +static void fpu_raise_exception(void) +{ + if (env->cr[0] & CR0_NE_MASK) { + raise_exception(EXCP10_COPR); + } +#if !defined(CONFIG_USER_ONLY) + else { + cpu_set_ferr(env); + } +#endif +} + +void helper_flds_FT0(uint32_t val) +{ + union { + float32 f; + uint32_t i; + } u; + u.i = val; + FT0 = float32_to_floatx(u.f, &env->fp_status); +} + +void helper_fldl_FT0(uint64_t val) +{ + union { + float64 f; + uint64_t i; + } u; + u.i = val; + FT0 = float64_to_floatx(u.f, &env->fp_status); +} + +void helper_fildl_FT0(int32_t val) +{ + FT0 = int32_to_floatx(val, &env->fp_status); +} + +void helper_flds_ST0(uint32_t val) +{ + int new_fpstt; + union { + float32 f; + uint32_t i; + } u; + new_fpstt = (env->fpstt - 1) & 7; + u.i = val; + env->fpregs[new_fpstt].d = float32_to_floatx(u.f, &env->fp_status); + env->fpstt = new_fpstt; + env->fptags[new_fpstt] = 0; /* validate stack entry */ +} + +void helper_fldl_ST0(uint64_t val) +{ + int new_fpstt; + union { + float64 f; + uint64_t i; + } u; + new_fpstt = (env->fpstt - 1) & 7; + u.i = val; + env->fpregs[new_fpstt].d = float64_to_floatx(u.f, &env->fp_status); + env->fpstt = new_fpstt; + env->fptags[new_fpstt] = 0; /* validate stack entry */ +} + +void helper_fildl_ST0(int32_t val) +{ + int new_fpstt; + new_fpstt = (env->fpstt - 1) & 7; + env->fpregs[new_fpstt].d = int32_to_floatx(val, &env->fp_status); + env->fpstt = new_fpstt; + env->fptags[new_fpstt] = 0; /* validate stack entry */ +} + +void helper_fildll_ST0(int64_t val) +{ + int new_fpstt; + new_fpstt = (env->fpstt - 1) & 7; + env->fpregs[new_fpstt].d = int64_to_floatx(val, &env->fp_status); + env->fpstt = new_fpstt; + env->fptags[new_fpstt] = 0; /* validate stack entry */ +} + +#ifndef VBOX +uint32_t helper_fsts_ST0(void) +#else +RTCCUINTREG helper_fsts_ST0(void) +#endif +{ + union { + float32 f; + uint32_t i; + } u; + u.f = 
floatx_to_float32(ST0, &env->fp_status); + return u.i; +} + +uint64_t helper_fstl_ST0(void) +{ + union { + float64 f; + uint64_t i; + } u; + u.f = floatx_to_float64(ST0, &env->fp_status); + return u.i; +} + +#ifndef VBOX +int32_t helper_fist_ST0(void) +#else +RTCCINTREG helper_fist_ST0(void) +#endif +{ + int32_t val; + val = floatx_to_int32(ST0, &env->fp_status); + if (val != (int16_t)val) + val = -32768; + return val; +} + +#ifndef VBOX +int32_t helper_fistl_ST0(void) +#else +RTCCINTREG helper_fistl_ST0(void) +#endif +{ + int32_t val; + val = floatx_to_int32(ST0, &env->fp_status); + return val; +} + +int64_t helper_fistll_ST0(void) +{ + int64_t val; + val = floatx_to_int64(ST0, &env->fp_status); + return val; +} + +#ifndef VBOX +int32_t helper_fistt_ST0(void) +#else +RTCCINTREG helper_fistt_ST0(void) +#endif +{ + int32_t val; + val = floatx_to_int32_round_to_zero(ST0, &env->fp_status); + if (val != (int16_t)val) + val = -32768; + return val; +} + +#ifndef VBOX +int32_t helper_fisttl_ST0(void) +#else +RTCCINTREG helper_fisttl_ST0(void) +#endif +{ + int32_t val; + val = floatx_to_int32_round_to_zero(ST0, &env->fp_status); + return val; +} + +int64_t helper_fisttll_ST0(void) +{ + int64_t val; + val = floatx_to_int64_round_to_zero(ST0, &env->fp_status); + return val; +} + +void helper_fldt_ST0(target_ulong ptr) +{ + int new_fpstt; + new_fpstt = (env->fpstt - 1) & 7; + env->fpregs[new_fpstt].d = helper_fldt(ptr); + env->fpstt = new_fpstt; + env->fptags[new_fpstt] = 0; /* validate stack entry */ +} + +void helper_fstt_ST0(target_ulong ptr) +{ + helper_fstt(ST0, ptr); +} + +void helper_fpush(void) +{ + fpush(); +} + +void helper_fpop(void) +{ + fpop(); +} + +void helper_fdecstp(void) +{ + env->fpstt = (env->fpstt - 1) & 7; + env->fpus &= (~0x4700); +} + +void helper_fincstp(void) +{ + env->fpstt = (env->fpstt + 1) & 7; + env->fpus &= (~0x4700); +} + +/* FPU move */ + +void helper_ffree_STN(int st_index) +{ + env->fptags[(env->fpstt + st_index) & 7] = 1; +} + +void helper_fmov_ST0_FT0(void) +{ + ST0 = FT0; +} + +void helper_fmov_FT0_STN(int st_index) +{ + FT0 = ST(st_index); +} + +void helper_fmov_ST0_STN(int st_index) +{ + ST0 = ST(st_index); +} + +void helper_fmov_STN_ST0(int st_index) +{ + ST(st_index) = ST0; +} + +void helper_fxchg_ST0_STN(int st_index) +{ + CPU86_LDouble tmp; + tmp = ST(st_index); + ST(st_index) = ST0; + ST0 = tmp; +} + +/* FPU operations */ + +static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500}; + +void helper_fcom_ST0_FT0(void) +{ + int ret; + + ret = floatx_compare(ST0, FT0, &env->fp_status); + env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1]; +} + +void helper_fucom_ST0_FT0(void) +{ + int ret; + + ret = floatx_compare_quiet(ST0, FT0, &env->fp_status); + env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret+ 1]; +} + +static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; + +void helper_fcomi_ST0_FT0(void) +{ + int eflags; + int ret; + + ret = floatx_compare(ST0, FT0, &env->fp_status); + eflags = helper_cc_compute_all(CC_OP); + eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1]; + CC_SRC = eflags; +} + +void helper_fucomi_ST0_FT0(void) +{ + int eflags; + int ret; + + ret = floatx_compare_quiet(ST0, FT0, &env->fp_status); + eflags = helper_cc_compute_all(CC_OP); + eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1]; + CC_SRC = eflags; +} + +void helper_fadd_ST0_FT0(void) +{ + ST0 += FT0; +} + +void helper_fmul_ST0_FT0(void) +{ + ST0 *= FT0; +} + +void helper_fsub_ST0_FT0(void) +{ + ST0 -= FT0; +} + +void 
helper_fsubr_ST0_FT0(void) +{ + ST0 = FT0 - ST0; +} + +void helper_fdiv_ST0_FT0(void) +{ + ST0 = helper_fdiv(ST0, FT0); +} + +void helper_fdivr_ST0_FT0(void) +{ + ST0 = helper_fdiv(FT0, ST0); +} + +/* fp operations between STN and ST0 */ + +void helper_fadd_STN_ST0(int st_index) +{ + ST(st_index) += ST0; +} + +void helper_fmul_STN_ST0(int st_index) +{ + ST(st_index) *= ST0; +} + +void helper_fsub_STN_ST0(int st_index) +{ + ST(st_index) -= ST0; +} + +void helper_fsubr_STN_ST0(int st_index) +{ + CPU86_LDouble *p; + p = &ST(st_index); + *p = ST0 - *p; +} + +void helper_fdiv_STN_ST0(int st_index) +{ + CPU86_LDouble *p; + p = &ST(st_index); + *p = helper_fdiv(*p, ST0); +} + +void helper_fdivr_STN_ST0(int st_index) +{ + CPU86_LDouble *p; + p = &ST(st_index); + *p = helper_fdiv(ST0, *p); +} + +/* misc FPU operations */ +void helper_fchs_ST0(void) +{ + ST0 = floatx_chs(ST0); +} + +void helper_fabs_ST0(void) +{ + ST0 = floatx_abs(ST0); +} + +void helper_fld1_ST0(void) +{ + ST0 = f15rk[1]; +} + +void helper_fldl2t_ST0(void) +{ + ST0 = f15rk[6]; +} + +void helper_fldl2e_ST0(void) +{ + ST0 = f15rk[5]; +} + +void helper_fldpi_ST0(void) +{ + ST0 = f15rk[2]; +} + +void helper_fldlg2_ST0(void) +{ + ST0 = f15rk[3]; +} + +void helper_fldln2_ST0(void) +{ + ST0 = f15rk[4]; +} + +void helper_fldz_ST0(void) +{ + ST0 = f15rk[0]; +} + +void helper_fldz_FT0(void) +{ + FT0 = f15rk[0]; +} + +#ifndef VBOX +uint32_t helper_fnstsw(void) +#else +RTCCUINTREG helper_fnstsw(void) +#endif +{ + return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; +} + +#ifndef VBOX +uint32_t helper_fnstcw(void) +#else +RTCCUINTREG helper_fnstcw(void) +#endif +{ + return env->fpuc; +} + +static void update_fp_status(void) +{ + int rnd_type; + + /* set rounding mode */ + switch(env->fpuc & RC_MASK) { + default: + case RC_NEAR: + rnd_type = float_round_nearest_even; + break; + case RC_DOWN: + rnd_type = float_round_down; + break; + case RC_UP: + rnd_type = float_round_up; + break; + case RC_CHOP: + rnd_type = float_round_to_zero; + break; + } + set_float_rounding_mode(rnd_type, &env->fp_status); +#ifdef FLOATX80 + switch((env->fpuc >> 8) & 3) { + case 0: + rnd_type = 32; + break; + case 2: + rnd_type = 64; + break; + case 3: + default: + rnd_type = 80; + break; + } + set_floatx80_rounding_precision(rnd_type, &env->fp_status); +#endif +} + +void helper_fldcw(uint32_t val) +{ + env->fpuc = val; + update_fp_status(); +} + +void helper_fclex(void) +{ + env->fpus &= 0x7f00; +} + +void helper_fwait(void) +{ + if (env->fpus & FPUS_SE) + fpu_raise_exception(); +} + +void helper_fninit(void) +{ + env->fpus = 0; + env->fpstt = 0; + env->fpuc = 0x37f; + env->fptags[0] = 1; + env->fptags[1] = 1; + env->fptags[2] = 1; + env->fptags[3] = 1; + env->fptags[4] = 1; + env->fptags[5] = 1; + env->fptags[6] = 1; + env->fptags[7] = 1; +} + +/* BCD ops */ + +void helper_fbld_ST0(target_ulong ptr) +{ + CPU86_LDouble tmp; + uint64_t val; + unsigned int v; + int i; + + val = 0; + for(i = 8; i >= 0; i--) { + v = ldub(ptr + i); + val = (val * 100) + ((v >> 4) * 10) + (v & 0xf); + } + tmp = val; + if (ldub(ptr + 9) & 0x80) + tmp = -tmp; + fpush(); + ST0 = tmp; +} + +void helper_fbst_ST0(target_ulong ptr) +{ + int v; + target_ulong mem_ref, mem_end; + int64_t val; + + val = floatx_to_int64(ST0, &env->fp_status); + mem_ref = ptr; + mem_end = mem_ref + 9; + if (val < 0) { + stb(mem_end, 0x80); + val = -val; + } else { + stb(mem_end, 0x00); + } + while (mem_ref < mem_end) { + if (val == 0) + break; + v = val % 100; + val = val / 100; + v = ((v / 10) << 4) | (v % 10); + 
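+        /* two packed BCD digits per byte, tens digit in the high nibble:
+           a remainder of 47 is stored as 0x47 */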
stb(mem_ref++, v); + } + while (mem_ref < mem_end) { + stb(mem_ref++, 0); + } +} + +void helper_f2xm1(void) +{ + ST0 = pow(2.0,ST0) - 1.0; +} + +void helper_fyl2x(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if (fptemp>0.0){ + fptemp = log(fptemp)/log(2.0); /* log2(ST) */ + ST1 *= fptemp; + fpop(); + } else { + env->fpus &= (~0x4700); + env->fpus |= 0x400; + } +} + +void helper_fptan(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = tan(fptemp); + fpush(); + ST0 = 1.0; + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg| < 2**52 only */ + } +} + +void helper_fpatan(void) +{ + CPU86_LDouble fptemp, fpsrcop; + + fpsrcop = ST1; + fptemp = ST0; + ST1 = atan2(fpsrcop,fptemp); + fpop(); +} + +void helper_fxtract(void) +{ + CPU86_LDoubleU temp; + unsigned int expdif; + + temp.d = ST0; + expdif = EXPD(temp) - EXPBIAS; + /*DP exponent bias*/ + ST0 = expdif; + fpush(); + BIASEXPONENT(temp); + ST0 = temp.d; +} + +void helper_fprem1(void) +{ + CPU86_LDouble dblq, fpsrcop, fptemp; + CPU86_LDoubleU fpsrcop1, fptemp1; + int expdif; + signed long long int q; + +#ifndef VBOX /* Unfortunately, we cannot handle isinf/isnan easily in wrapper */ + if (isinf(ST0) || isnan(ST0) || isnan(ST1) || (ST1 == 0.0)) { +#else + if ((ST0 != ST0) || (ST1 != ST1) || (ST1 == 0.0)) { +#endif + ST0 = 0.0 / 0.0; /* NaN */ + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + return; + } + + fpsrcop = ST0; + fptemp = ST1; + fpsrcop1.d = fpsrcop; + fptemp1.d = fptemp; + expdif = EXPD(fpsrcop1) - EXPD(fptemp1); + + if (expdif < 0) { + /* optimisation? taken from the AMD docs */ + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + /* ST0 is unchanged */ + return; + } + + if (expdif < 53) { + dblq = fpsrcop / fptemp; + /* round dblq towards nearest integer */ + dblq = rint(dblq); + ST0 = fpsrcop - fptemp * dblq; + + /* convert dblq to q by truncating towards zero */ + if (dblq < 0.0) + q = (signed long long int)(-dblq); + else + q = (signed long long int)dblq; + + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + /* (C0,C3,C1) <-- (q2,q1,q0) */ + env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */ + env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */ + env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */ + } else { + env->fpus |= 0x400; /* C2 <-- 1 */ + fptemp = pow(2.0, expdif - 50); + fpsrcop = (ST0 / ST1) / fptemp; + /* fpsrcop = integer obtained by chopping */ + fpsrcop = (fpsrcop < 0.0) ? + -(floor(fabs(fpsrcop))) : floor(fpsrcop); + ST0 -= (ST1 * fpsrcop * fptemp); + } +} + +void helper_fprem(void) +{ + CPU86_LDouble dblq, fpsrcop, fptemp; + CPU86_LDoubleU fpsrcop1, fptemp1; + int expdif; + signed long long int q; + +#ifndef VBOX /* Unfortunately, we cannot easily handle isinf/isnan in wrapper */ + if (isinf(ST0) || isnan(ST0) || isnan(ST1) || (ST1 == 0.0)) { +#else + if ((ST0 != ST0) || (ST1 != ST1) || (ST1 == 0.0)) { +#endif + ST0 = 0.0 / 0.0; /* NaN */ + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + return; + } + + fpsrcop = (CPU86_LDouble)ST0; + fptemp = (CPU86_LDouble)ST1; + fpsrcop1.d = fpsrcop; + fptemp1.d = fptemp; + expdif = EXPD(fpsrcop1) - EXPD(fptemp1); + + if (expdif < 0) { + /* optimisation? taken from the AMD docs */ + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + /* ST0 is unchanged */ + return; + } + + if ( expdif < 53 ) { + dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/; + /* round dblq towards zero */ + dblq = (dblq < 0.0) ? 
ceil(dblq) : floor(dblq); + ST0 = fpsrcop/*ST0*/ - fptemp * dblq; + + /* convert dblq to q by truncating towards zero */ + if (dblq < 0.0) + q = (signed long long int)(-dblq); + else + q = (signed long long int)dblq; + + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + /* (C0,C3,C1) <-- (q2,q1,q0) */ + env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */ + env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */ + env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */ + } else { + int N = 32 + (expdif % 32); /* as per AMD docs */ + env->fpus |= 0x400; /* C2 <-- 1 */ + fptemp = pow(2.0, (double)(expdif - N)); + fpsrcop = (ST0 / ST1) / fptemp; + /* fpsrcop = integer obtained by chopping */ + fpsrcop = (fpsrcop < 0.0) ? + -(floor(fabs(fpsrcop))) : floor(fpsrcop); + ST0 -= (ST1 * fpsrcop * fptemp); + } +} + +void helper_fyl2xp1(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if ((fptemp+1.0)>0.0) { + fptemp = log(fptemp+1.0) / log(2.0); /* log2(ST+1.0) */ + ST1 *= fptemp; + fpop(); + } else { + env->fpus &= (~0x4700); + env->fpus |= 0x400; + } +} + +void helper_fsqrt(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if (fptemp<0.0) { + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + env->fpus |= 0x400; + } + ST0 = sqrt(fptemp); +} + +void helper_fsincos(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if ((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = sin(fptemp); + fpush(); + ST0 = cos(fptemp); + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg| < 2**63 only */ + } +} + +void helper_frndint(void) +{ + ST0 = floatx_round_to_int(ST0, &env->fp_status); +} + +void helper_fscale(void) +{ + ST0 = ldexp (ST0, (int)(ST1)); +} + +void helper_fsin(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if ((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = sin(fptemp); + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg| < 2**53 only */ + } +} + +void helper_fcos(void) +{ + CPU86_LDouble fptemp; + + fptemp = ST0; + if((fptemp > MAXTAN)||(fptemp < -MAXTAN)) { + env->fpus |= 0x400; + } else { + ST0 = cos(fptemp); + env->fpus &= (~0x400); /* C2 <-- 0 */ + /* the above code is for |arg5 < 2**63 only */ + } +} + +void helper_fxam_ST0(void) +{ + CPU86_LDoubleU temp; + int expdif; + + temp.d = ST0; + + env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ + if (SIGND(temp)) + env->fpus |= 0x200; /* C1 <-- 1 */ + + /* XXX: test fptags too */ + expdif = EXPD(temp); + if (expdif == MAXEXPD) { +#ifdef USE_X86LDOUBLE + if (MANTD(temp) == 0x8000000000000000ULL) +#else + if (MANTD(temp) == 0) +#endif + env->fpus |= 0x500 /*Infinity*/; + else + env->fpus |= 0x100 /*NaN*/; + } else if (expdif == 0) { + if (MANTD(temp) == 0) + env->fpus |= 0x4000 /*Zero*/; + else + env->fpus |= 0x4400 /*Denormal*/; + } else { + env->fpus |= 0x400; + } +} + +void helper_fstenv(target_ulong ptr, int data32) +{ + int fpus, fptag, exp, i; + uint64_t mant; + CPU86_LDoubleU tmp; + + fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; + fptag = 0; + for (i=7; i>=0; i--) { + fptag <<= 2; + if (env->fptags[i]) { + fptag |= 3; + } else { + tmp.d = env->fpregs[i].d; + exp = EXPD(tmp); + mant = MANTD(tmp); + if (exp == 0 && mant == 0) { + /* zero */ + fptag |= 1; + } else if (exp == 0 || exp == MAXEXPD +#ifdef USE_X86LDOUBLE + || (mant & (1LL << 63)) == 0 +#endif + ) { + /* NaNs, infinity, denormal */ + fptag |= 2; + } + } + } + if (data32) { + /* 32 bit */ + stl(ptr, env->fpuc); + stl(ptr + 4, fpus); + stl(ptr + 8, 
fptag); + stl(ptr + 12, 0); /* fpip */ + stl(ptr + 16, 0); /* fpcs */ + stl(ptr + 20, 0); /* fpoo */ + stl(ptr + 24, 0); /* fpos */ + } else { + /* 16 bit */ + stw(ptr, env->fpuc); + stw(ptr + 2, fpus); + stw(ptr + 4, fptag); + stw(ptr + 6, 0); + stw(ptr + 8, 0); + stw(ptr + 10, 0); + stw(ptr + 12, 0); + } +} + +void helper_fldenv(target_ulong ptr, int data32) +{ + int i, fpus, fptag; + + if (data32) { + env->fpuc = lduw(ptr); + fpus = lduw(ptr + 4); + fptag = lduw(ptr + 8); + } + else { + env->fpuc = lduw(ptr); + fpus = lduw(ptr + 2); + fptag = lduw(ptr + 4); + } + env->fpstt = (fpus >> 11) & 7; + env->fpus = fpus & ~0x3800; + for(i = 0;i < 8; i++) { + env->fptags[i] = ((fptag & 3) == 3); + fptag >>= 2; + } +} + +void helper_fsave(target_ulong ptr, int data32) +{ + CPU86_LDouble tmp; + int i; + + helper_fstenv(ptr, data32); + + ptr += (14 << data32); + for(i = 0;i < 8; i++) { + tmp = ST(i); + helper_fstt(tmp, ptr); + ptr += 10; + } + + /* fninit */ + env->fpus = 0; + env->fpstt = 0; + env->fpuc = 0x37f; + env->fptags[0] = 1; + env->fptags[1] = 1; + env->fptags[2] = 1; + env->fptags[3] = 1; + env->fptags[4] = 1; + env->fptags[5] = 1; + env->fptags[6] = 1; + env->fptags[7] = 1; +} + +void helper_frstor(target_ulong ptr, int data32) +{ + CPU86_LDouble tmp; + int i; + + helper_fldenv(ptr, data32); + ptr += (14 << data32); + + for(i = 0;i < 8; i++) { + tmp = helper_fldt(ptr); + ST(i) = tmp; + ptr += 10; + } +} + +void helper_fxsave(target_ulong ptr, int data64) +{ + int fpus, fptag, i, nb_xmm_regs; + CPU86_LDouble tmp; + target_ulong addr; + + /* The operand must be 16 byte aligned */ + if (ptr & 0xf) { + raise_exception(EXCP0D_GPF); + } + + fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; + fptag = 0; + for(i = 0; i < 8; i++) { + fptag |= (env->fptags[i] << i); + } + stw(ptr, env->fpuc); + stw(ptr + 2, fpus); + stw(ptr + 4, fptag ^ 0xff); +#ifdef TARGET_X86_64 + if (data64) { + stq(ptr + 0x08, 0); /* rip */ + stq(ptr + 0x10, 0); /* rdp */ + } else +#endif + { + stl(ptr + 0x08, 0); /* eip */ + stl(ptr + 0x0c, 0); /* sel */ + stl(ptr + 0x10, 0); /* dp */ + stl(ptr + 0x14, 0); /* sel */ + } + + addr = ptr + 0x20; + for(i = 0;i < 8; i++) { + tmp = ST(i); + helper_fstt(tmp, addr); + addr += 16; + } + + if (env->cr[4] & CR4_OSFXSR_MASK) { + /* XXX: finish it */ + stl(ptr + 0x18, env->mxcsr); /* mxcsr */ + stl(ptr + 0x1c, 0x0000ffff); /* mxcsr_mask */ + if (env->hflags & HF_CS64_MASK) + nb_xmm_regs = 16; + else + nb_xmm_regs = 8; + addr = ptr + 0xa0; + /* Fast FXSAVE leaves out the XMM registers */ + if (!(env->efer & MSR_EFER_FFXSR) + || (env->hflags & HF_CPL_MASK) + || !(env->hflags & HF_LMA_MASK)) { + for(i = 0; i < nb_xmm_regs; i++) { + stq(addr, env->xmm_regs[i].XMM_Q(0)); + stq(addr + 8, env->xmm_regs[i].XMM_Q(1)); + addr += 16; + } + } + } +} + +void helper_fxrstor(target_ulong ptr, int data64) +{ + int i, fpus, fptag, nb_xmm_regs; + CPU86_LDouble tmp; + target_ulong addr; + + /* The operand must be 16 byte aligned */ + if (ptr & 0xf) { + raise_exception(EXCP0D_GPF); + } + + env->fpuc = lduw(ptr); + fpus = lduw(ptr + 2); + fptag = lduw(ptr + 4); + env->fpstt = (fpus >> 11) & 7; + env->fpus = fpus & ~0x3800; + fptag ^= 0xff; + for(i = 0;i < 8; i++) { + env->fptags[i] = ((fptag >> i) & 1); + } + + addr = ptr + 0x20; + for(i = 0;i < 8; i++) { + tmp = helper_fldt(addr); + ST(i) = tmp; + addr += 16; + } + + if (env->cr[4] & CR4_OSFXSR_MASK) { + /* XXX: finish it */ + env->mxcsr = ldl(ptr + 0x18); + //ldl(ptr + 0x1c); + if (env->hflags & HF_CS64_MASK) + nb_xmm_regs = 16; + else + 
nb_xmm_regs = 8; + addr = ptr + 0xa0; + /* Fast FXRESTORE leaves out the XMM registers */ + if (!(env->efer & MSR_EFER_FFXSR) + || (env->hflags & HF_CPL_MASK) + || !(env->hflags & HF_LMA_MASK)) { + for(i = 0; i < nb_xmm_regs; i++) { +#if !defined(VBOX) || __GNUC__ < 4 + env->xmm_regs[i].XMM_Q(0) = ldq(addr); + env->xmm_regs[i].XMM_Q(1) = ldq(addr + 8); +#else /* VBOX + __GNUC__ >= 4: gcc 4.x compiler bug - it runs out of registers for the 64-bit value. */ +# if 1 + env->xmm_regs[i].XMM_L(0) = ldl(addr); + env->xmm_regs[i].XMM_L(1) = ldl(addr + 4); + env->xmm_regs[i].XMM_L(2) = ldl(addr + 8); + env->xmm_regs[i].XMM_L(3) = ldl(addr + 12); +# else + /* this works fine on Mac OS X, gcc 4.0.1 */ + uint64_t u64 = ldq(addr); + env->xmm_regs[i].XMM_Q(0); + u64 = ldq(addr + 4); + env->xmm_regs[i].XMM_Q(1) = u64; +# endif +#endif + addr += 16; + } + } + } +} + +#ifndef USE_X86LDOUBLE + +void cpu_get_fp80(uint64_t *pmant, uint16_t *pexp, CPU86_LDouble f) +{ + CPU86_LDoubleU temp; + int e; + + temp.d = f; + /* mantissa */ + *pmant = (MANTD(temp) << 11) | (1LL << 63); + /* exponent + sign */ + e = EXPD(temp) - EXPBIAS + 16383; + e |= SIGND(temp) >> 16; + *pexp = e; +} + +CPU86_LDouble cpu_set_fp80(uint64_t mant, uint16_t upper) +{ + CPU86_LDoubleU temp; + int e; + uint64_t ll; + + /* XXX: handle overflow ? */ + e = (upper & 0x7fff) - 16383 + EXPBIAS; /* exponent */ + e |= (upper >> 4) & 0x800; /* sign */ + ll = (mant >> 11) & ((1LL << 52) - 1); +#ifdef __arm__ + temp.l.upper = (e << 20) | (ll >> 32); + temp.l.lower = ll; +#else + temp.ll = ll | ((uint64_t)e << 52); +#endif + return temp.d; +} + +#else + +void cpu_get_fp80(uint64_t *pmant, uint16_t *pexp, CPU86_LDouble f) +{ + CPU86_LDoubleU temp; + + temp.d = f; + *pmant = temp.l.lower; + *pexp = temp.l.upper; +} + +CPU86_LDouble cpu_set_fp80(uint64_t mant, uint16_t upper) +{ + CPU86_LDoubleU temp; + + temp.l.upper = upper; + temp.l.lower = mant; + return temp.d; +} +#endif + +#ifdef TARGET_X86_64 + +//#define DEBUG_MULDIV + +static void add128(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b) +{ + *plow += a; + /* carry test */ + if (*plow < a) + (*phigh)++; + *phigh += b; +} + +static void neg128(uint64_t *plow, uint64_t *phigh) +{ + *plow = ~ *plow; + *phigh = ~ *phigh; + add128(plow, phigh, 1, 0); +} + +/* return TRUE if overflow */ +static int div64(uint64_t *plow, uint64_t *phigh, uint64_t b) +{ + uint64_t q, r, a1, a0; + int i, qb, ab; + + a0 = *plow; + a1 = *phigh; + if (a1 == 0) { + q = a0 / b; + r = a0 % b; + *plow = q; + *phigh = r; + } else { + if (a1 >= b) + return 1; + /* XXX: use a better algorithm */ + for(i = 0; i < 64; i++) { + ab = a1 >> 63; + a1 = (a1 << 1) | (a0 >> 63); + if (ab || a1 >= b) { + a1 -= b; + qb = 1; + } else { + qb = 0; + } + a0 = (a0 << 1) | qb; + } +#if defined(DEBUG_MULDIV) + printf("div: 0x%016" PRIx64 "%016" PRIx64 " / 0x%016" PRIx64 ": q=0x%016" PRIx64 " r=0x%016" PRIx64 "\n", + *phigh, *plow, b, a0, a1); +#endif + *plow = a0; + *phigh = a1; + } + return 0; +} + +/* return TRUE if overflow */ +static int idiv64(uint64_t *plow, uint64_t *phigh, int64_t b) +{ + int sa, sb; + sa = ((int64_t)*phigh < 0); + if (sa) + neg128(plow, phigh); + sb = (b < 0); + if (sb) + b = -b; + if (div64(plow, phigh, b) != 0) + return 1; + if (sa ^ sb) { + if (*plow > (1ULL << 63)) + return 1; + *plow = - *plow; + } else { + if (*plow >= (1ULL << 63)) + return 1; + } + if (sa) + *phigh = - *phigh; + return 0; +} + +void helper_mulq_EAX_T0(target_ulong t0) +{ + uint64_t r0, r1; + + mulu64(&r0, &r1, EAX, t0); + EAX = r0; + EDX = 
r1; + CC_DST = r0; + CC_SRC = r1; +} + +void helper_imulq_EAX_T0(target_ulong t0) +{ + uint64_t r0, r1; + + muls64(&r0, &r1, EAX, t0); + EAX = r0; + EDX = r1; + CC_DST = r0; + CC_SRC = ((int64_t)r1 != ((int64_t)r0 >> 63)); +} + +target_ulong helper_imulq_T0_T1(target_ulong t0, target_ulong t1) +{ + uint64_t r0, r1; + + muls64(&r0, &r1, t0, t1); + CC_DST = r0; + CC_SRC = ((int64_t)r1 != ((int64_t)r0 >> 63)); + return r0; +} + +void helper_divq_EAX(target_ulong t0) +{ + uint64_t r0, r1; + if (t0 == 0) { + raise_exception(EXCP00_DIVZ); + } + r0 = EAX; + r1 = EDX; + if (div64(&r0, &r1, t0)) + raise_exception(EXCP00_DIVZ); + EAX = r0; + EDX = r1; +} + +void helper_idivq_EAX(target_ulong t0) +{ + uint64_t r0, r1; + if (t0 == 0) { + raise_exception(EXCP00_DIVZ); + } + r0 = EAX; + r1 = EDX; + if (idiv64(&r0, &r1, t0)) + raise_exception(EXCP00_DIVZ); + EAX = r0; + EDX = r1; +} +#endif + +static void do_hlt(void) +{ + env->hflags &= ~HF_INHIBIT_IRQ_MASK; /* needed if sti is just before */ + env->halted = 1; + env->exception_index = EXCP_HLT; + cpu_loop_exit(); +} + +void helper_hlt(int next_eip_addend) +{ + helper_svm_check_intercept_param(SVM_EXIT_HLT, 0); + EIP += next_eip_addend; + + do_hlt(); +} + +void helper_monitor(target_ulong ptr) +{ +#ifdef VBOX + if ((uint32_t)ECX > 1) + raise_exception(EXCP0D_GPF); +#else /* !VBOX */ + if ((uint32_t)ECX != 0) + raise_exception(EXCP0D_GPF); +#endif /* !VBOX */ + /* XXX: store address ? */ + helper_svm_check_intercept_param(SVM_EXIT_MONITOR, 0); +} + +void helper_mwait(int next_eip_addend) +{ + if ((uint32_t)ECX != 0) + raise_exception(EXCP0D_GPF); +#ifdef VBOX + helper_hlt(next_eip_addend); +#else /* !VBOX */ + helper_svm_check_intercept_param(SVM_EXIT_MWAIT, 0); + EIP += next_eip_addend; + + /* XXX: not complete but not completely erroneous */ + if (env->cpu_index != 0 || env->next_cpu != NULL) { + /* more than one CPU: do not sleep because another CPU may + wake this one */ + } else { + do_hlt(); + } +#endif /* !VBOX */ +} + +void helper_debug(void) +{ + env->exception_index = EXCP_DEBUG; + cpu_loop_exit(); +} + +void helper_reset_rf(void) +{ + env->eflags &= ~RF_MASK; +} + +void helper_raise_interrupt(int intno, int next_eip_addend) +{ + raise_interrupt(intno, 1, 0, next_eip_addend); +} + +void helper_raise_exception(int exception_index) +{ + raise_exception(exception_index); +} + +void helper_cli(void) +{ + env->eflags &= ~IF_MASK; +} + +void helper_sti(void) +{ + env->eflags |= IF_MASK; +} + +#ifdef VBOX +void helper_cli_vme(void) +{ + env->eflags &= ~VIF_MASK; +} + +void helper_sti_vme(void) +{ + /* First check, then change eflags according to the AMD manual */ + if (env->eflags & VIP_MASK) { + raise_exception(EXCP0D_GPF); + } + env->eflags |= VIF_MASK; +} +#endif /* VBOX */ + +#if 0 +/* vm86plus instructions */ +void helper_cli_vm(void) +{ + env->eflags &= ~VIF_MASK; +} + +void helper_sti_vm(void) +{ + env->eflags |= VIF_MASK; + if (env->eflags & VIP_MASK) { + raise_exception(EXCP0D_GPF); + } +} +#endif + +void helper_set_inhibit_irq(void) +{ + env->hflags |= HF_INHIBIT_IRQ_MASK; +} + +void helper_reset_inhibit_irq(void) +{ + env->hflags &= ~HF_INHIBIT_IRQ_MASK; +} + +void helper_boundw(target_ulong a0, int v) +{ + int low, high; + low = ldsw(a0); + high = ldsw(a0 + 2); + v = (int16_t)v; + if (v < low || v > high) { + raise_exception(EXCP05_BOUND); + } +} + +void helper_boundl(target_ulong a0, int v) +{ + int low, high; + low = ldl(a0); + high = ldl(a0 + 4); + if (v < low || v > high) { + raise_exception(EXCP05_BOUND); + } +} + +static float 
approx_rsqrt(float a) +{ + return 1.0 / sqrt(a); +} + +static float approx_rcp(float a) +{ + return 1.0 / a; +} + +#if !defined(CONFIG_USER_ONLY) + +#define MMUSUFFIX _mmu + +#define SHIFT 0 +#include "softmmu_template.h" + +#define SHIFT 1 +#include "softmmu_template.h" + +#define SHIFT 2 +#include "softmmu_template.h" + +#define SHIFT 3 +#include "softmmu_template.h" + +#endif + +#if defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) +/* This code assumes real physical address always fit into host CPU reg, + which is wrong in general, but true for our current use cases. */ +RTCCUINTREG REGPARM __ldb_vbox_phys(RTCCUINTREG addr) +{ + return remR3PhysReadS8(addr); +} +RTCCUINTREG REGPARM __ldub_vbox_phys(RTCCUINTREG addr) +{ + return remR3PhysReadU8(addr); +} +void REGPARM __stb_vbox_phys(RTCCUINTREG addr, RTCCUINTREG val) +{ + remR3PhysWriteU8(addr, val); +} +RTCCUINTREG REGPARM __ldw_vbox_phys(RTCCUINTREG addr) +{ + return remR3PhysReadS16(addr); +} +RTCCUINTREG REGPARM __lduw_vbox_phys(RTCCUINTREG addr) +{ + return remR3PhysReadU16(addr); +} +void REGPARM __stw_vbox_phys(RTCCUINTREG addr, RTCCUINTREG val) +{ + remR3PhysWriteU16(addr, val); +} +RTCCUINTREG REGPARM __ldl_vbox_phys(RTCCUINTREG addr) +{ + return remR3PhysReadS32(addr); +} +RTCCUINTREG REGPARM __ldul_vbox_phys(RTCCUINTREG addr) +{ + return remR3PhysReadU32(addr); +} +void REGPARM __stl_vbox_phys(RTCCUINTREG addr, RTCCUINTREG val) +{ + remR3PhysWriteU32(addr, val); +} +uint64_t REGPARM __ldq_vbox_phys(RTCCUINTREG addr) +{ + return remR3PhysReadU64(addr); +} +void REGPARM __stq_vbox_phys(RTCCUINTREG addr, uint64_t val) +{ + remR3PhysWriteU64(addr, val); +} +#endif /* VBOX */ + +#if !defined(CONFIG_USER_ONLY) +/* try to fill the TLB and return an exception if error. If retaddr is + NULL, it means that the function was called in C code (i.e. not + from generated code or from helper.c) */ +/* XXX: fix it to restore all registers */ +void tlb_fill(target_ulong addr, int is_write, int mmu_idx, void *retaddr) +{ + TranslationBlock *tb; + int ret; + uintptr_t pc; + CPUX86State *saved_env; + + /* XXX: hack to restore env in all cases, even if not called from + generated code */ + saved_env = env; + env = cpu_single_env; + + ret = cpu_x86_handle_mmu_fault(env, addr, is_write, mmu_idx, 1); + if (ret) { + if (retaddr) { + /* now we have a real cpu fault */ + pc = (uintptr_t)retaddr; + tb = tb_find_pc(pc); + if (tb) { + /* the PC is inside the translated code. It means that we have + a virtual CPU fault */ + cpu_restore_state(tb, env, pc, NULL); + } + } + raise_exception_err(env->exception_index, env->error_code); + } + env = saved_env; +} +#endif + +#ifdef VBOX + +/** + * Correctly computes the eflags. + * @returns eflags. + * @param env1 CPU environment. + */ +uint32_t raw_compute_eflags(CPUX86State *env1) +{ + CPUX86State *savedenv = env; + uint32_t efl; + env = env1; + efl = compute_eflags(); + env = savedenv; + return efl; +} + +/** + * Reads byte from virtual address in guest memory area. + * XXX: is it working for any addresses? swapped out pages? + * @returns read data byte. + * @param env1 CPU environment. + * @param pvAddr GC Virtual address. + */ +uint8_t read_byte(CPUX86State *env1, target_ulong addr) +{ + CPUX86State *savedenv = env; + uint8_t u8; + env = env1; + u8 = ldub_kernel(addr); + env = savedenv; + return u8; +} + +/** + * Reads byte from virtual address in guest memory area. + * XXX: is it working for any addresses? swapped out pages? + * @returns read data byte. + * @param env1 CPU environment. 
+ * @param pvAddr GC Virtual address. + */ +uint16_t read_word(CPUX86State *env1, target_ulong addr) +{ + CPUX86State *savedenv = env; + uint16_t u16; + env = env1; + u16 = lduw_kernel(addr); + env = savedenv; + return u16; +} + +/** + * Reads byte from virtual address in guest memory area. + * XXX: is it working for any addresses? swapped out pages? + * @returns read data byte. + * @param env1 CPU environment. + * @param pvAddr GC Virtual address. + */ +uint32_t read_dword(CPUX86State *env1, target_ulong addr) +{ + CPUX86State *savedenv = env; + uint32_t u32; + env = env1; + u32 = ldl_kernel(addr); + env = savedenv; + return u32; +} + +/** + * Writes byte to virtual address in guest memory area. + * XXX: is it working for any addresses? swapped out pages? + * @returns read data byte. + * @param env1 CPU environment. + * @param pvAddr GC Virtual address. + * @param val byte value + */ +void write_byte(CPUX86State *env1, target_ulong addr, uint8_t val) +{ + CPUX86State *savedenv = env; + env = env1; + stb(addr, val); + env = savedenv; +} + +void write_word(CPUX86State *env1, target_ulong addr, uint16_t val) +{ + CPUX86State *savedenv = env; + env = env1; + stw(addr, val); + env = savedenv; +} + +void write_dword(CPUX86State *env1, target_ulong addr, uint32_t val) +{ + CPUX86State *savedenv = env; + env = env1; + stl(addr, val); + env = savedenv; +} + +/** + * Correctly loads selector into segment register with updating internal + * qemu data/caches. + * @param env1 CPU environment. + * @param seg_reg Segment register. + * @param selector Selector to load. + */ +void sync_seg(CPUX86State *env1, int seg_reg, int selector) +{ + CPUX86State *savedenv = env; +#ifdef FORCE_SEGMENT_SYNC + jmp_buf old_buf; +#endif + + env = env1; + + if ( env->eflags & X86_EFL_VM + || !(env->cr[0] & X86_CR0_PE)) + { + load_seg_vm(seg_reg, selector); + + env = savedenv; + + /* Successful sync. */ + Assert(env1->segs[seg_reg].newselector == 0); + } + else + { + /* For some reasons, it works even w/o save/restore of the jump buffer, so as code is + time critical - let's not do that */ +#ifdef FORCE_SEGMENT_SYNC + memcpy(&old_buf, &env1->jmp_env, sizeof(old_buf)); +#endif + if (setjmp(env1->jmp_env) == 0) + { + if (seg_reg == R_CS) + { + uint32_t e1, e2; + e1 = e2 = 0; + load_segment(&e1, &e2, selector); + cpu_x86_load_seg_cache(env, R_CS, selector, + get_seg_base(e1, e2), + get_seg_limit(e1, e2), + e2); + } + else + helper_load_seg(seg_reg, selector); + /* We used to use tss_load_seg(seg_reg, selector); which, for some reasons ignored + loading 0 selectors, what, in order, lead to subtle problems like #3588 */ + + env = savedenv; + + /* Successful sync. */ + Assert(env1->segs[seg_reg].newselector == 0); + } + else + { + env = savedenv; + + /* Postpone sync until the guest uses the selector. */ + env1->segs[seg_reg].selector = selector; /* hidden values are now incorrect, but will be resynced when this register is accessed. 
*/ + env1->segs[seg_reg].newselector = selector; + Log(("sync_seg: out of sync seg_reg=%d selector=%#x\n", seg_reg, selector)); + env1->exception_index = -1; + env1->error_code = 0; + env1->old_exception = -1; + } +#ifdef FORCE_SEGMENT_SYNC + memcpy(&env1->jmp_env, &old_buf, sizeof(old_buf)); +#endif + } + +} + +DECLINLINE(void) tb_reset_jump(TranslationBlock *tb, int n) +{ + tb_set_jmp_target(tb, n, (uintptr_t)(tb->tc_ptr + tb->tb_next_offset[n])); +} + + +int emulate_single_instr(CPUX86State *env1) +{ + TranslationBlock *tb; + TranslationBlock *current; + int flags; + uint8_t *tc_ptr; + target_ulong old_eip; + + /* ensures env is loaded! */ + CPUX86State *savedenv = env; + env = env1; + + RAWEx_ProfileStart(env, STATS_EMULATE_SINGLE_INSTR); + + current = env->current_tb; + env->current_tb = NULL; + flags = env->hflags | (env->eflags & (IOPL_MASK | TF_MASK | VM_MASK)); + + /* + * Translate only one instruction. + */ + ASMAtomicOrU32(&env->state, CPU_EMULATE_SINGLE_INSTR); + tb = tb_gen_code(env, env->eip + env->segs[R_CS].base, + env->segs[R_CS].base, flags, 0); + + ASMAtomicAndU32(&env->state, ~CPU_EMULATE_SINGLE_INSTR); + + + /* tb_link_phys: */ + tb->jmp_first = (TranslationBlock *)((intptr_t)tb | 2); + tb->jmp_next[0] = NULL; + tb->jmp_next[1] = NULL; + Assert(tb->jmp_next[0] == NULL); + Assert(tb->jmp_next[1] == NULL); + if (tb->tb_next_offset[0] != 0xffff) + tb_reset_jump(tb, 0); + if (tb->tb_next_offset[1] != 0xffff) + tb_reset_jump(tb, 1); + + /* + * Execute it using emulation + */ + old_eip = env->eip; + env->current_tb = tb; + + /* + * eip remains the same for repeated instructions; no idea why qemu doesn't do a jump inside the generated code + * perhaps not a very safe hack + */ + while (old_eip == env->eip) + { + tc_ptr = tb->tc_ptr; + +#if defined(VBOX) && defined(GCC_WITH_BUGGY_REGPARM) + int fake_ret; + tcg_qemu_tb_exec(tc_ptr, fake_ret); +#else + tcg_qemu_tb_exec(tc_ptr); +#endif + + /* + * Exit once we detect an external interrupt and interrupts are enabled + */ + if ( (env->interrupt_request & (CPU_INTERRUPT_EXTERNAL_EXIT | CPU_INTERRUPT_EXTERNAL_TIMER)) + || ( (env->eflags & IF_MASK) + && !(env->hflags & HF_INHIBIT_IRQ_MASK) + && (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_HARD) ) + ) + { + break; + } + if (env->interrupt_request & CPU_INTERRUPT_EXTERNAL_FLUSH_TLB) { + tlb_flush(env, true); + } + } + env->current_tb = current; + + tb_phys_invalidate(tb, -1); + tb_free(tb); +/* + Assert(tb->tb_next_offset[0] == 0xffff); + Assert(tb->tb_next_offset[1] == 0xffff); + Assert(tb->tb_next[0] == 0xffff); + Assert(tb->tb_next[1] == 0xffff); + Assert(tb->jmp_next[0] == NULL); + Assert(tb->jmp_next[1] == NULL); + Assert(tb->jmp_first == NULL); */ + + RAWEx_ProfileStop(env, STATS_EMULATE_SINGLE_INSTR); + + /* + * Execute the next instruction when we encounter instruction fusing. + */ + if (env->hflags & HF_INHIBIT_IRQ_MASK) + { + Log(("REM: Emulating next instruction due to instruction fusing (HF_INHIBIT_IRQ_MASK) at %RGv\n", env->eip)); + env->hflags &= ~HF_INHIBIT_IRQ_MASK; + emulate_single_instr(env); + } + + env = savedenv; + return 0; +} + +/** + * Correctly loads a new ldtr selector. + * + * @param env1 CPU environment. + * @param selector Selector to load. 
+ */ +void sync_ldtr(CPUX86State *env1, int selector) +{ + CPUX86State *saved_env = env; + if (setjmp(env1->jmp_env) == 0) + { + env = env1; + helper_lldt(selector); + env = saved_env; + } + else + { + env = saved_env; +#ifdef VBOX_STRICT + cpu_abort(env1, "sync_ldtr: selector=%#x\n", selector); +#endif + } +} + +int get_ss_esp_from_tss_raw(CPUX86State *env1, uint32_t *ss_ptr, + uint32_t *esp_ptr, int dpl) +{ + int type, index, shift; + + CPUX86State *savedenv = env; + env = env1; + + if (!(env->tr.flags & DESC_P_MASK)) + cpu_abort(env, "invalid tss"); + type = (env->tr.flags >> DESC_TYPE_SHIFT) & 0xf; + if ((type & 7) != 3) + cpu_abort(env, "invalid tss type %d", type); + shift = type >> 3; + index = (dpl * 4 + 2) << shift; + if (index + (4 << shift) - 1 > env->tr.limit) + { + env = savedenv; + return 0; + } + //raise_exception_err(EXCP0A_TSS, env->tr.selector & 0xfffc); + + if (shift == 0) { + *esp_ptr = lduw_kernel(env->tr.base + index); + *ss_ptr = lduw_kernel(env->tr.base + index + 2); + } else { + *esp_ptr = ldl_kernel(env->tr.base + index); + *ss_ptr = lduw_kernel(env->tr.base + index + 4); + } + + env = savedenv; + return 1; +} + +//***************************************************************************** +// Needs to be at the bottom of the file (overriding macros) + +static inline CPU86_LDouble helper_fldt_raw(uint8_t *ptr) +{ +#ifdef USE_X86LDOUBLE + CPU86_LDoubleU tmp; + tmp.l.lower = *(uint64_t const *)ptr; + tmp.l.upper = *(uint16_t const *)(ptr + 8); + return tmp.d; +#else +# error "Busted FPU saving/restoring!" + return *(CPU86_LDouble *)ptr; +#endif +} + +static inline void helper_fstt_raw(CPU86_LDouble f, uint8_t *ptr) +{ +#ifdef USE_X86LDOUBLE + CPU86_LDoubleU tmp; + tmp.d = f; + *(uint64_t *)(ptr + 0) = tmp.l.lower; + *(uint16_t *)(ptr + 8) = tmp.l.upper; + *(uint16_t *)(ptr + 10) = 0; + *(uint32_t *)(ptr + 12) = 0; + AssertCompile(sizeof(long double) > 8); +#else +# error "Busted FPU saving/restoring!" + *(CPU86_LDouble *)ptr = f; +#endif +} + +#undef stw +#undef stl +#undef stq +#define stw(a,b) *(uint16_t *)(a) = (uint16_t)(b) +#define stl(a,b) *(uint32_t *)(a) = (uint32_t)(b) +#define stq(a,b) *(uint64_t *)(a) = (uint64_t)(b) + +//***************************************************************************** +void restore_raw_fp_state(CPUX86State *env, uint8_t *ptr) +{ + int fpus, fptag, i, nb_xmm_regs; + CPU86_LDouble tmp; + uint8_t *addr; + int data64 = !!(env->hflags & HF_LMA_MASK); + + if (env->cpuid_features & CPUID_FXSR) + { + fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; + fptag = 0; + for(i = 0; i < 8; i++) { + fptag |= (env->fptags[i] << i); + } + stw(ptr, env->fpuc); + stw(ptr + 2, fpus); + stw(ptr + 4, fptag ^ 0xff); + + addr = ptr + 0x20; + for(i = 0;i < 8; i++) { + tmp = ST(i); + helper_fstt_raw(tmp, addr); + addr += 16; + } + + if (env->cr[4] & CR4_OSFXSR_MASK) { + /* XXX: finish it */ + stl(ptr + 0x18, env->mxcsr); /* mxcsr */ + stl(ptr + 0x1c, 0x0000ffff); /* mxcsr_mask */ + nb_xmm_regs = 8 << data64; + addr = ptr + 0xa0; + for(i = 0; i < nb_xmm_regs; i++) { +#if __GNUC__ < 4 + stq(addr, env->xmm_regs[i].XMM_Q(0)); + stq(addr + 8, env->xmm_regs[i].XMM_Q(1)); +#else /* VBOX + __GNUC__ >= 4: gcc 4.x compiler bug - it runs out of registers for the 64-bit value. 
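+         The workaround below therefore writes each 128-bit XMM register as four 32-bit
+         stl() stores instead of two 64-bit stq() stores, so no 64-bit temporary has to
+         be kept in a register.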
*/ + stl(addr, env->xmm_regs[i].XMM_L(0)); + stl(addr + 4, env->xmm_regs[i].XMM_L(1)); + stl(addr + 8, env->xmm_regs[i].XMM_L(2)); + stl(addr + 12, env->xmm_regs[i].XMM_L(3)); +#endif + addr += 16; + } + } + } + else + { + PX86FPUSTATE fp = (PX86FPUSTATE)ptr; + int fptag; + + fp->FCW = env->fpuc; + fp->FSW = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; + fptag = 0; + for (i=7; i>=0; i--) { + fptag <<= 2; + if (env->fptags[i]) { + fptag |= 3; + } else { + /* the FPU automatically computes it */ + } + } + fp->FTW = fptag; + + for(i = 0;i < 8; i++) { + tmp = ST(i); + helper_fstt_raw(tmp, &fp->regs[i].au8[0]); + } + } +} + +//***************************************************************************** +#undef lduw +#undef ldl +#undef ldq +#define lduw(a) *(uint16_t *)(a) +#define ldl(a) *(uint32_t *)(a) +#define ldq(a) *(uint64_t *)(a) +//***************************************************************************** +void save_raw_fp_state(CPUX86State *env, uint8_t *ptr) +{ + int i, fpus, fptag, nb_xmm_regs; + CPU86_LDouble tmp; + uint8_t *addr; + int data64 = !!(env->hflags & HF_LMA_MASK); /* don't use HF_CS64_MASK here as cs hasn't been synced when this function is called. */ + + if (env->cpuid_features & CPUID_FXSR) + { + env->fpuc = lduw(ptr); + fpus = lduw(ptr + 2); + fptag = lduw(ptr + 4); + env->fpstt = (fpus >> 11) & 7; + env->fpus = fpus & ~0x3800; + fptag ^= 0xff; + for(i = 0;i < 8; i++) { + env->fptags[i] = ((fptag >> i) & 1); + } + + addr = ptr + 0x20; + for(i = 0;i < 8; i++) { + tmp = helper_fldt_raw(addr); + ST(i) = tmp; + addr += 16; + } + + if (env->cr[4] & CR4_OSFXSR_MASK) { + /* XXX: finish it, endianness */ + env->mxcsr = ldl(ptr + 0x18); + //ldl(ptr + 0x1c); + nb_xmm_regs = 8 << data64; + addr = ptr + 0xa0; + for(i = 0; i < nb_xmm_regs; i++) { +#if HC_ARCH_BITS == 32 + /* this is a workaround for http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35135 */ + env->xmm_regs[i].XMM_L(0) = ldl(addr); + env->xmm_regs[i].XMM_L(1) = ldl(addr + 4); + env->xmm_regs[i].XMM_L(2) = ldl(addr + 8); + env->xmm_regs[i].XMM_L(3) = ldl(addr + 12); +#else + env->xmm_regs[i].XMM_Q(0) = ldq(addr); + env->xmm_regs[i].XMM_Q(1) = ldq(addr + 8); +#endif + addr += 16; + } + } + } + else + { + PX86FPUSTATE fp = (PX86FPUSTATE)ptr; + int fptag, j; + + env->fpuc = fp->FCW; + env->fpstt = (fp->FSW >> 11) & 7; + env->fpus = fp->FSW & ~0x3800; + fptag = fp->FTW; + for(i = 0;i < 8; i++) { + env->fptags[i] = ((fptag & 3) == 3); + fptag >>= 2; + } + j = env->fpstt; + for(i = 0;i < 8; i++) { + tmp = helper_fldt_raw(&fp->regs[i].au8[0]); + ST(i) = tmp; + } + } +} +//***************************************************************************** +//***************************************************************************** + +#endif /* VBOX */ + +/* Secure Virtual Machine helpers */ + +#if defined(CONFIG_USER_ONLY) + +void helper_vmrun(int aflag, int next_eip_addend) +{ +} +void helper_vmmcall(void) +{ +} +void helper_vmload(int aflag) +{ +} +void helper_vmsave(int aflag) +{ +} +void helper_stgi(void) +{ +} +void helper_clgi(void) +{ +} +void helper_skinit(void) +{ +} +void helper_invlpga(int aflag) +{ +} +void helper_vmexit(uint32_t exit_code, uint64_t exit_info_1) +{ +} +void helper_svm_check_intercept_param(uint32_t type, uint64_t param) +{ +} + +void helper_svm_check_io(uint32_t port, uint32_t param, + uint32_t next_eip_addend) +{ +} +#else + +static inline void svm_save_seg(target_phys_addr_t addr, + const SegmentCache *sc) +{ + stw_phys(addr + offsetof(struct vmcb_seg, selector), + sc->selector); + 
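+    /* The remaining stores mirror the rest of struct vmcb_seg: the 64-bit base, the
+       32-bit limit and the 12-bit SVM attribute encoding, which skips bits 16..19
+       (the limit 19:16 field) of the hidden descriptor flags.  Worked illustration:
+       the hidden flags 0x00c09300 of a flat 32-bit data segment pack to attrib 0x0c93,
+       and svm_load_seg() below expands 0x0c93 back to 0x00c09300. */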
stq_phys(addr + offsetof(struct vmcb_seg, base), + sc->base); + stl_phys(addr + offsetof(struct vmcb_seg, limit), + sc->limit); + stw_phys(addr + offsetof(struct vmcb_seg, attrib), + ((sc->flags >> 8) & 0xff) | ((sc->flags >> 12) & 0x0f00)); +} + +static inline void svm_load_seg(target_phys_addr_t addr, SegmentCache *sc) +{ + unsigned int flags; + + sc->selector = lduw_phys(addr + offsetof(struct vmcb_seg, selector)); + sc->base = ldq_phys(addr + offsetof(struct vmcb_seg, base)); + sc->limit = ldl_phys(addr + offsetof(struct vmcb_seg, limit)); + flags = lduw_phys(addr + offsetof(struct vmcb_seg, attrib)); + sc->flags = ((flags & 0xff) << 8) | ((flags & 0x0f00) << 12); +} + +static inline void svm_load_seg_cache(target_phys_addr_t addr, + CPUState *env, int seg_reg) +{ + SegmentCache sc1, *sc = &sc1; + svm_load_seg(addr, sc); + cpu_x86_load_seg_cache(env, seg_reg, sc->selector, + sc->base, sc->limit, sc->flags); +} + +void helper_vmrun(int aflag, int next_eip_addend) +{ + target_ulong addr; + uint32_t event_inj; + uint32_t int_ctl; + + helper_svm_check_intercept_param(SVM_EXIT_VMRUN, 0); + + if (aflag == 2) + addr = EAX; + else + addr = (uint32_t)EAX; + + qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmrun! " TARGET_FMT_lx "\n", addr); + + env->vm_vmcb = addr; + + /* save the current CPU state in the hsave page */ + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.gdtr.base), env->gdt.base); + stl_phys(env->vm_hsave + offsetof(struct vmcb, save.gdtr.limit), env->gdt.limit); + + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.idtr.base), env->idt.base); + stl_phys(env->vm_hsave + offsetof(struct vmcb, save.idtr.limit), env->idt.limit); + + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.cr0), env->cr[0]); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.cr2), env->cr[2]); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.cr3), env->cr[3]); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.cr4), env->cr[4]); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.dr6), env->dr[6]); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.dr7), env->dr[7]); + + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.efer), env->efer); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.rflags), compute_eflags()); + + svm_save_seg(env->vm_hsave + offsetof(struct vmcb, save.es), + &env->segs[R_ES]); + svm_save_seg(env->vm_hsave + offsetof(struct vmcb, save.cs), + &env->segs[R_CS]); + svm_save_seg(env->vm_hsave + offsetof(struct vmcb, save.ss), + &env->segs[R_SS]); + svm_save_seg(env->vm_hsave + offsetof(struct vmcb, save.ds), + &env->segs[R_DS]); + + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.rip), + EIP + next_eip_addend); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.rsp), ESP); + stq_phys(env->vm_hsave + offsetof(struct vmcb, save.rax), EAX); + + /* load the interception bitmaps so we do not need to access the + vmcb in svm mode */ + env->intercept = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, control.intercept)); + env->intercept_cr_read = lduw_phys(env->vm_vmcb + offsetof(struct vmcb, control.intercept_cr_read)); + env->intercept_cr_write = lduw_phys(env->vm_vmcb + offsetof(struct vmcb, control.intercept_cr_write)); + env->intercept_dr_read = lduw_phys(env->vm_vmcb + offsetof(struct vmcb, control.intercept_dr_read)); + env->intercept_dr_write = lduw_phys(env->vm_vmcb + offsetof(struct vmcb, control.intercept_dr_write)); + env->intercept_exceptions = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.intercept_exceptions)); + + /* enable intercepts */ + env->hflags 
|= HF_SVMI_MASK; + + env->tsc_offset = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, control.tsc_offset)); + + env->gdt.base = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.gdtr.base)); + env->gdt.limit = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, save.gdtr.limit)); + + env->idt.base = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.idtr.base)); + env->idt.limit = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, save.idtr.limit)); + + /* clear exit_info_2 so we behave like the real hardware */ + stq_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2), 0); + + cpu_x86_update_cr0(env, ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr0))); + cpu_x86_update_cr4(env, ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr4))); + cpu_x86_update_cr3(env, ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr3))); + env->cr[2] = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr2)); + int_ctl = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.int_ctl)); + env->hflags2 &= ~(HF2_HIF_MASK | HF2_VINTR_MASK); + if (int_ctl & V_INTR_MASKING_MASK) { + env->v_tpr = int_ctl & V_TPR_MASK; + env->hflags2 |= HF2_VINTR_MASK; + if (env->eflags & IF_MASK) + env->hflags2 |= HF2_HIF_MASK; + } + + cpu_load_efer(env, + ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.efer))); + env->eflags = 0; + load_eflags(ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rflags)), + ~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK)); + CC_OP = CC_OP_EFLAGS; + + svm_load_seg_cache(env->vm_vmcb + offsetof(struct vmcb, save.es), + env, R_ES); + svm_load_seg_cache(env->vm_vmcb + offsetof(struct vmcb, save.cs), + env, R_CS); + svm_load_seg_cache(env->vm_vmcb + offsetof(struct vmcb, save.ss), + env, R_SS); + svm_load_seg_cache(env->vm_vmcb + offsetof(struct vmcb, save.ds), + env, R_DS); + + EIP = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rip)); + env->eip = EIP; + ESP = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rsp)); + EAX = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rax)); + env->dr[7] = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.dr7)); + env->dr[6] = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, save.dr6)); + cpu_x86_set_cpl(env, ldub_phys(env->vm_vmcb + offsetof(struct vmcb, save.cpl))); + + /* FIXME: guest state consistency checks */ + + switch(ldub_phys(env->vm_vmcb + offsetof(struct vmcb, control.tlb_ctl))) { + case TLB_CONTROL_DO_NOTHING: + break; + case TLB_CONTROL_FLUSH_ALL_ASID: + /* FIXME: this is not 100% correct but should work for now */ + tlb_flush(env, 1); + break; + } + + env->hflags2 |= HF2_GIF_MASK; + + if (int_ctl & V_IRQ_MASK) { + env->interrupt_request |= CPU_INTERRUPT_VIRQ; + } + + /* maybe we need to inject an event */ + event_inj = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj)); + if (event_inj & SVM_EVTINJ_VALID) { + uint8_t vector = event_inj & SVM_EVTINJ_VEC_MASK; + uint16_t valid_err = event_inj & SVM_EVTINJ_VALID_ERR; + uint32_t event_inj_err = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj_err)); + + qemu_log_mask(CPU_LOG_TB_IN_ASM, "Injecting(%#hx): ", valid_err); + /* FIXME: need to implement valid_err */ + switch (event_inj & SVM_EVTINJ_TYPE_MASK) { + case SVM_EVTINJ_TYPE_INTR: + env->exception_index = vector; + env->error_code = event_inj_err; + env->exception_is_int = 0; + env->exception_next_eip = -1; + qemu_log_mask(CPU_LOG_TB_IN_ASM, "INTR"); + /* XXX: is it always correct ? 
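+                        (unlike the NMI/EXEPT/SOFT cases below, which leave through
+                        cpu_loop_exit(), the INTR case is delivered right here via
+                        do_interrupt() and execution falls out of the switch)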
*/ + do_interrupt(vector, 0, 0, 0, 1); + break; + case SVM_EVTINJ_TYPE_NMI: + env->exception_index = EXCP02_NMI; + env->error_code = event_inj_err; + env->exception_is_int = 0; + env->exception_next_eip = EIP; + qemu_log_mask(CPU_LOG_TB_IN_ASM, "NMI"); + cpu_loop_exit(); + break; + case SVM_EVTINJ_TYPE_EXEPT: + env->exception_index = vector; + env->error_code = event_inj_err; + env->exception_is_int = 0; + env->exception_next_eip = -1; + qemu_log_mask(CPU_LOG_TB_IN_ASM, "EXEPT"); + cpu_loop_exit(); + break; + case SVM_EVTINJ_TYPE_SOFT: + env->exception_index = vector; + env->error_code = event_inj_err; + env->exception_is_int = 1; + env->exception_next_eip = EIP; + qemu_log_mask(CPU_LOG_TB_IN_ASM, "SOFT"); + cpu_loop_exit(); + break; + } + qemu_log_mask(CPU_LOG_TB_IN_ASM, " %#x %#x\n", env->exception_index, env->error_code); + } +} + +void helper_vmmcall(void) +{ + helper_svm_check_intercept_param(SVM_EXIT_VMMCALL, 0); + raise_exception(EXCP06_ILLOP); +} + +void helper_vmload(int aflag) +{ + target_ulong addr; + helper_svm_check_intercept_param(SVM_EXIT_VMLOAD, 0); + + if (aflag == 2) + addr = EAX; + else + addr = (uint32_t)EAX; + + qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmload! " TARGET_FMT_lx "\nFS: %016" PRIx64 " | " TARGET_FMT_lx "\n", + addr, ldq_phys(addr + offsetof(struct vmcb, save.fs.base)), + env->segs[R_FS].base); + + svm_load_seg_cache(addr + offsetof(struct vmcb, save.fs), + env, R_FS); + svm_load_seg_cache(addr + offsetof(struct vmcb, save.gs), + env, R_GS); + svm_load_seg(addr + offsetof(struct vmcb, save.tr), + &env->tr); + svm_load_seg(addr + offsetof(struct vmcb, save.ldtr), + &env->ldt); + +#ifdef TARGET_X86_64 + env->kernelgsbase = ldq_phys(addr + offsetof(struct vmcb, save.kernel_gs_base)); + env->lstar = ldq_phys(addr + offsetof(struct vmcb, save.lstar)); + env->cstar = ldq_phys(addr + offsetof(struct vmcb, save.cstar)); + env->fmask = ldq_phys(addr + offsetof(struct vmcb, save.sfmask)); +#endif + env->star = ldq_phys(addr + offsetof(struct vmcb, save.star)); + env->sysenter_cs = ldq_phys(addr + offsetof(struct vmcb, save.sysenter_cs)); + env->sysenter_esp = ldq_phys(addr + offsetof(struct vmcb, save.sysenter_esp)); + env->sysenter_eip = ldq_phys(addr + offsetof(struct vmcb, save.sysenter_eip)); +} + +void helper_vmsave(int aflag) +{ + target_ulong addr; + helper_svm_check_intercept_param(SVM_EXIT_VMSAVE, 0); + + if (aflag == 2) + addr = EAX; + else + addr = (uint32_t)EAX; + + qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmsave! 
" TARGET_FMT_lx "\nFS: %016" PRIx64 " | " TARGET_FMT_lx "\n", + addr, ldq_phys(addr + offsetof(struct vmcb, save.fs.base)), + env->segs[R_FS].base); + + svm_save_seg(addr + offsetof(struct vmcb, save.fs), + &env->segs[R_FS]); + svm_save_seg(addr + offsetof(struct vmcb, save.gs), + &env->segs[R_GS]); + svm_save_seg(addr + offsetof(struct vmcb, save.tr), + &env->tr); + svm_save_seg(addr + offsetof(struct vmcb, save.ldtr), + &env->ldt); + +#ifdef TARGET_X86_64 + stq_phys(addr + offsetof(struct vmcb, save.kernel_gs_base), env->kernelgsbase); + stq_phys(addr + offsetof(struct vmcb, save.lstar), env->lstar); + stq_phys(addr + offsetof(struct vmcb, save.cstar), env->cstar); + stq_phys(addr + offsetof(struct vmcb, save.sfmask), env->fmask); +#endif + stq_phys(addr + offsetof(struct vmcb, save.star), env->star); + stq_phys(addr + offsetof(struct vmcb, save.sysenter_cs), env->sysenter_cs); + stq_phys(addr + offsetof(struct vmcb, save.sysenter_esp), env->sysenter_esp); + stq_phys(addr + offsetof(struct vmcb, save.sysenter_eip), env->sysenter_eip); +} + +void helper_stgi(void) +{ + helper_svm_check_intercept_param(SVM_EXIT_STGI, 0); + env->hflags2 |= HF2_GIF_MASK; +} + +void helper_clgi(void) +{ + helper_svm_check_intercept_param(SVM_EXIT_CLGI, 0); + env->hflags2 &= ~HF2_GIF_MASK; +} + +void helper_skinit(void) +{ + helper_svm_check_intercept_param(SVM_EXIT_SKINIT, 0); + /* XXX: not implemented */ + raise_exception(EXCP06_ILLOP); +} + +void helper_invlpga(int aflag) +{ + target_ulong addr; + helper_svm_check_intercept_param(SVM_EXIT_INVLPGA, 0); + + if (aflag == 2) + addr = EAX; + else + addr = (uint32_t)EAX; + + /* XXX: could use the ASID to see if it is needed to do the + flush */ + tlb_flush_page(env, addr); +} + +void helper_svm_check_intercept_param(uint32_t type, uint64_t param) +{ + if (likely(!(env->hflags & HF_SVMI_MASK))) + return; +#ifndef VBOX + switch(type) { + case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR0 + 8: + if (env->intercept_cr_read & (1 << (type - SVM_EXIT_READ_CR0))) { + helper_vmexit(type, param); + } + break; + case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR0 + 8: + if (env->intercept_cr_write & (1 << (type - SVM_EXIT_WRITE_CR0))) { + helper_vmexit(type, param); + } + break; + case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR0 + 7: + if (env->intercept_dr_read & (1 << (type - SVM_EXIT_READ_DR0))) { + helper_vmexit(type, param); + } + break; + case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR0 + 7: + if (env->intercept_dr_write & (1 << (type - SVM_EXIT_WRITE_DR0))) { + helper_vmexit(type, param); + } + break; + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 31: + if (env->intercept_exceptions & (1 << (type - SVM_EXIT_EXCP_BASE))) { + helper_vmexit(type, param); + } + break; + case SVM_EXIT_MSR: + if (env->intercept & (1ULL << (SVM_EXIT_MSR - SVM_EXIT_INTR))) { + /* FIXME: this should be read in at vmrun (faster this way?) */ + uint64_t addr = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, control.msrpm_base_pa)); + uint32_t t0, t1; + switch((uint32_t)ECX) { + case 0 ... 0x1fff: + t0 = (ECX * 2) % 8; + t1 = ECX / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + ECX - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 
0xc0011fff: + t0 = (16384 + ECX - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + helper_vmexit(type, param); + t0 = 0; + t1 = 0; + break; + } + if (ldub_phys(addr + t1) & ((1 << param) << t0)) + helper_vmexit(type, param); + } + break; + default: + if (env->intercept & (1ULL << (type - SVM_EXIT_INTR))) { + helper_vmexit(type, param); + } + break; + } +#else /* VBOX */ + AssertMsgFailed(("We shouldn't be here, HM supported differently!")); +#endif /* VBOX */ +} + +void helper_svm_check_io(uint32_t port, uint32_t param, + uint32_t next_eip_addend) +{ + if (env->intercept & (1ULL << (SVM_EXIT_IOIO - SVM_EXIT_INTR))) { + /* FIXME: this should be read in at vmrun (faster this way?) */ + uint64_t addr = ldq_phys(env->vm_vmcb + offsetof(struct vmcb, control.iopm_base_pa)); + uint16_t mask = (1 << ((param >> 4) & 7)) - 1; + if(lduw_phys(addr + port / 8) & (mask << (port & 7))) { + /* next EIP */ + stq_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2), + env->eip + next_eip_addend); + helper_vmexit(SVM_EXIT_IOIO, param | (port << 16)); + } + } +} + +/* Note: currently only 32 bits of exit_code are used */ +void helper_vmexit(uint32_t exit_code, uint64_t exit_info_1) +{ + uint32_t int_ctl; + + qemu_log_mask(CPU_LOG_TB_IN_ASM, "vmexit(%08x, %016" PRIx64 ", %016" PRIx64 ", " TARGET_FMT_lx ")!\n", + exit_code, exit_info_1, + ldq_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2)), + EIP); + + if(env->hflags & HF_INHIBIT_IRQ_MASK) { + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.int_state), SVM_INTERRUPT_SHADOW_MASK); + env->hflags &= ~HF_INHIBIT_IRQ_MASK; + } else { + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.int_state), 0); + } + + /* Save the VM state in the vmcb */ + svm_save_seg(env->vm_vmcb + offsetof(struct vmcb, save.es), + &env->segs[R_ES]); + svm_save_seg(env->vm_vmcb + offsetof(struct vmcb, save.cs), + &env->segs[R_CS]); + svm_save_seg(env->vm_vmcb + offsetof(struct vmcb, save.ss), + &env->segs[R_SS]); + svm_save_seg(env->vm_vmcb + offsetof(struct vmcb, save.ds), + &env->segs[R_DS]); + + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.gdtr.base), env->gdt.base); + stl_phys(env->vm_vmcb + offsetof(struct vmcb, save.gdtr.limit), env->gdt.limit); + + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.idtr.base), env->idt.base); + stl_phys(env->vm_vmcb + offsetof(struct vmcb, save.idtr.limit), env->idt.limit); + + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.efer), env->efer); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr0), env->cr[0]); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr2), env->cr[2]); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr3), env->cr[3]); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.cr4), env->cr[4]); + + int_ctl = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.int_ctl)); + int_ctl &= ~(V_TPR_MASK | V_IRQ_MASK); + int_ctl |= env->v_tpr & V_TPR_MASK; + if (env->interrupt_request & CPU_INTERRUPT_VIRQ) + int_ctl |= V_IRQ_MASK; + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.int_ctl), int_ctl); + + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rflags), compute_eflags()); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rip), env->eip); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rsp), ESP); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.rax), EAX); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.dr7), env->dr[7]); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, save.dr6), env->dr[6]); + stb_phys(env->vm_vmcb + offsetof(struct 
vmcb, save.cpl), env->hflags & HF_CPL_MASK); + + /* Reload the host state from vm_hsave */ + env->hflags2 &= ~(HF2_HIF_MASK | HF2_VINTR_MASK); + env->hflags &= ~HF_SVMI_MASK; + env->intercept = 0; + env->intercept_exceptions = 0; + env->interrupt_request &= ~CPU_INTERRUPT_VIRQ; + env->tsc_offset = 0; + + env->gdt.base = ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.gdtr.base)); + env->gdt.limit = ldl_phys(env->vm_hsave + offsetof(struct vmcb, save.gdtr.limit)); + + env->idt.base = ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.idtr.base)); + env->idt.limit = ldl_phys(env->vm_hsave + offsetof(struct vmcb, save.idtr.limit)); + + cpu_x86_update_cr0(env, ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.cr0)) | CR0_PE_MASK); + cpu_x86_update_cr4(env, ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.cr4))); + cpu_x86_update_cr3(env, ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.cr3))); + /* we need to set the efer after the crs so the hidden flags get + set properly */ + cpu_load_efer(env, + ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.efer))); + env->eflags = 0; + load_eflags(ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.rflags)), + ~(CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C | DF_MASK)); + CC_OP = CC_OP_EFLAGS; + + svm_load_seg_cache(env->vm_hsave + offsetof(struct vmcb, save.es), + env, R_ES); + svm_load_seg_cache(env->vm_hsave + offsetof(struct vmcb, save.cs), + env, R_CS); + svm_load_seg_cache(env->vm_hsave + offsetof(struct vmcb, save.ss), + env, R_SS); + svm_load_seg_cache(env->vm_hsave + offsetof(struct vmcb, save.ds), + env, R_DS); + + EIP = ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.rip)); + ESP = ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.rsp)); + EAX = ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.rax)); + + env->dr[6] = ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.dr6)); + env->dr[7] = ldq_phys(env->vm_hsave + offsetof(struct vmcb, save.dr7)); + + /* other setups */ + cpu_x86_set_cpl(env, 0); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_code), exit_code); + stq_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_info_1), exit_info_1); + + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_int_info), + ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj))); + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.exit_int_info_err), + ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj_err))); + stl_phys(env->vm_vmcb + offsetof(struct vmcb, control.event_inj), 0); + + env->hflags2 &= ~HF2_GIF_MASK; + /* FIXME: Resets the current ASID register to zero (host ASID). */ + + /* Clears the V_IRQ and V_INTR_MASKING bits inside the processor. */ + + /* Clears the TSC_OFFSET inside the processor. */ + + /* If the host is in PAE mode, the processor reloads the host's PDPEs + from the page table indicated the host's CR3. If the PDPEs contain + illegal state, the processor causes a shutdown. */ + + /* Forces CR0.PE = 1, RFLAGS.VM = 0. */ + env->cr[0] |= CR0_PE_MASK; + env->eflags &= ~VM_MASK; + + /* Disables all breakpoints in the host DR7 register. */ + + /* Checks the reloaded host state for consistency. */ + + /* If the host's rIP reloaded by #VMEXIT is outside the limit of the + host's code segment or non-canonical (in the case of long mode), a + #GP fault is delivered inside the host.) 
*/ + + /* remove any pending exception */ + env->exception_index = -1; + env->error_code = 0; + env->old_exception = -1; + + cpu_loop_exit(); +} + +#endif + +/* MMX/SSE */ +/* XXX: optimize by storing fptt and fptags in the static cpu state */ +void helper_enter_mmx(void) +{ + env->fpstt = 0; + *(uint32_t *)(env->fptags) = 0; + *(uint32_t *)(env->fptags + 4) = 0; +} + +void helper_emms(void) +{ + /* set to empty state */ + *(uint32_t *)(env->fptags) = 0x01010101; + *(uint32_t *)(env->fptags + 4) = 0x01010101; +} + +/* XXX: suppress */ +void helper_movq(void *d, void *s) +{ + *(uint64_t *)d = *(uint64_t *)s; +} + +#define SHIFT 0 +#include "ops_sse.h" + +#define SHIFT 1 +#include "ops_sse.h" + +#define SHIFT 0 +#include "helper_template.h" +#undef SHIFT + +#define SHIFT 1 +#include "helper_template.h" +#undef SHIFT + +#define SHIFT 2 +#include "helper_template.h" +#undef SHIFT + +#ifdef TARGET_X86_64 + +#define SHIFT 3 +#include "helper_template.h" +#undef SHIFT + +#endif + +/* bit operations */ +target_ulong helper_bsf(target_ulong t0) +{ + int count; + target_ulong res; + + res = t0; + count = 0; + while ((res & 1) == 0) { + count++; + res >>= 1; + } + return count; +} + +target_ulong helper_lzcnt(target_ulong t0, int wordsize) +{ + int count; + target_ulong res, mask; + + if (wordsize > 0 && t0 == 0) { + return wordsize; + } + res = t0; + count = TARGET_LONG_BITS - 1; + mask = (target_ulong)1 << (TARGET_LONG_BITS - 1); + while ((res & mask) == 0) { + count--; + res <<= 1; + } + if (wordsize > 0) { + return wordsize - 1 - count; + } + return count; +} + +target_ulong helper_bsr(target_ulong t0) +{ + return helper_lzcnt(t0, 0); +} + +static int compute_all_eflags(void) +{ + return CC_SRC; +} + +static int compute_c_eflags(void) +{ + return CC_SRC & CC_C; +} + +uint32_t helper_cc_compute_all(int op) +{ + switch (op) { + default: /* should never happen */ return 0; + + case CC_OP_EFLAGS: return compute_all_eflags(); + + case CC_OP_MULB: return compute_all_mulb(); + case CC_OP_MULW: return compute_all_mulw(); + case CC_OP_MULL: return compute_all_mull(); + + case CC_OP_ADDB: return compute_all_addb(); + case CC_OP_ADDW: return compute_all_addw(); + case CC_OP_ADDL: return compute_all_addl(); + + case CC_OP_ADCB: return compute_all_adcb(); + case CC_OP_ADCW: return compute_all_adcw(); + case CC_OP_ADCL: return compute_all_adcl(); + + case CC_OP_SUBB: return compute_all_subb(); + case CC_OP_SUBW: return compute_all_subw(); + case CC_OP_SUBL: return compute_all_subl(); + + case CC_OP_SBBB: return compute_all_sbbb(); + case CC_OP_SBBW: return compute_all_sbbw(); + case CC_OP_SBBL: return compute_all_sbbl(); + + case CC_OP_LOGICB: return compute_all_logicb(); + case CC_OP_LOGICW: return compute_all_logicw(); + case CC_OP_LOGICL: return compute_all_logicl(); + + case CC_OP_INCB: return compute_all_incb(); + case CC_OP_INCW: return compute_all_incw(); + case CC_OP_INCL: return compute_all_incl(); + + case CC_OP_DECB: return compute_all_decb(); + case CC_OP_DECW: return compute_all_decw(); + case CC_OP_DECL: return compute_all_decl(); + + case CC_OP_SHLB: return compute_all_shlb(); + case CC_OP_SHLW: return compute_all_shlw(); + case CC_OP_SHLL: return compute_all_shll(); + + case CC_OP_SARB: return compute_all_sarb(); + case CC_OP_SARW: return compute_all_sarw(); + case CC_OP_SARL: return compute_all_sarl(); + +#ifdef TARGET_X86_64 + case CC_OP_MULQ: return compute_all_mulq(); + + case CC_OP_ADDQ: return compute_all_addq(); + + case CC_OP_ADCQ: return compute_all_adcq(); + + case CC_OP_SUBQ: return 
compute_all_subq(); + + case CC_OP_SBBQ: return compute_all_sbbq(); + + case CC_OP_LOGICQ: return compute_all_logicq(); + + case CC_OP_INCQ: return compute_all_incq(); + + case CC_OP_DECQ: return compute_all_decq(); + + case CC_OP_SHLQ: return compute_all_shlq(); + + case CC_OP_SARQ: return compute_all_sarq(); +#endif + } +} + +uint32_t helper_cc_compute_c(int op) +{ + switch (op) { + default: /* should never happen */ return 0; + + case CC_OP_EFLAGS: return compute_c_eflags(); + + case CC_OP_MULB: return compute_c_mull(); + case CC_OP_MULW: return compute_c_mull(); + case CC_OP_MULL: return compute_c_mull(); + + case CC_OP_ADDB: return compute_c_addb(); + case CC_OP_ADDW: return compute_c_addw(); + case CC_OP_ADDL: return compute_c_addl(); + + case CC_OP_ADCB: return compute_c_adcb(); + case CC_OP_ADCW: return compute_c_adcw(); + case CC_OP_ADCL: return compute_c_adcl(); + + case CC_OP_SUBB: return compute_c_subb(); + case CC_OP_SUBW: return compute_c_subw(); + case CC_OP_SUBL: return compute_c_subl(); + + case CC_OP_SBBB: return compute_c_sbbb(); + case CC_OP_SBBW: return compute_c_sbbw(); + case CC_OP_SBBL: return compute_c_sbbl(); + + case CC_OP_LOGICB: return compute_c_logicb(); + case CC_OP_LOGICW: return compute_c_logicw(); + case CC_OP_LOGICL: return compute_c_logicl(); + + case CC_OP_INCB: return compute_c_incl(); + case CC_OP_INCW: return compute_c_incl(); + case CC_OP_INCL: return compute_c_incl(); + + case CC_OP_DECB: return compute_c_incl(); + case CC_OP_DECW: return compute_c_incl(); + case CC_OP_DECL: return compute_c_incl(); + + case CC_OP_SHLB: return compute_c_shlb(); + case CC_OP_SHLW: return compute_c_shlw(); + case CC_OP_SHLL: return compute_c_shll(); + + case CC_OP_SARB: return compute_c_sarl(); + case CC_OP_SARW: return compute_c_sarl(); + case CC_OP_SARL: return compute_c_sarl(); + +#ifdef TARGET_X86_64 + case CC_OP_MULQ: return compute_c_mull(); + + case CC_OP_ADDQ: return compute_c_addq(); + + case CC_OP_ADCQ: return compute_c_adcq(); + + case CC_OP_SUBQ: return compute_c_subq(); + + case CC_OP_SBBQ: return compute_c_sbbq(); + + case CC_OP_LOGICQ: return compute_c_logicq(); + + case CC_OP_INCQ: return compute_c_incl(); + + case CC_OP_DECQ: return compute_c_incl(); + + case CC_OP_SHLQ: return compute_c_shlq(); + + case CC_OP_SARQ: return compute_c_sarl(); +#endif + } +} diff --git a/src/recompiler/target-i386/ops_sse.h b/src/recompiler/target-i386/ops_sse.h new file mode 100644 index 00000000..4c8b89e2 --- /dev/null +++ b/src/recompiler/target-i386/ops_sse.h @@ -0,0 +1,2111 @@ +/* + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support + * + * Copyright (c) 2005 Fabrice Bellard + * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#if SHIFT == 0 +#define Reg MMXReg +#define XMM_ONLY(...) +#define B(n) MMX_B(n) +#define W(n) MMX_W(n) +#define L(n) MMX_L(n) +#define Q(n) q +#define SUFFIX _mmx +#else +#define Reg XMMReg +#define XMM_ONLY(...) __VA_ARGS__ +#define B(n) XMM_B(n) +#define W(n) XMM_W(n) +#define L(n) XMM_L(n) +#define Q(n) XMM_Q(n) +#define SUFFIX _xmm +#endif + +void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 15) { + d->Q(0) = 0; +#if SHIFT == 1 + d->Q(1) = 0; +#endif + } else { + shift = s->B(0); + d->W(0) >>= shift; + d->W(1) >>= shift; + d->W(2) >>= shift; + d->W(3) >>= shift; +#if SHIFT == 1 + d->W(4) >>= shift; + d->W(5) >>= shift; + d->W(6) >>= shift; + d->W(7) >>= shift; +#endif + } +} + +void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 15) { + shift = 15; + } else { + shift = s->B(0); + } + d->W(0) = (int16_t)d->W(0) >> shift; + d->W(1) = (int16_t)d->W(1) >> shift; + d->W(2) = (int16_t)d->W(2) >> shift; + d->W(3) = (int16_t)d->W(3) >> shift; +#if SHIFT == 1 + d->W(4) = (int16_t)d->W(4) >> shift; + d->W(5) = (int16_t)d->W(5) >> shift; + d->W(6) = (int16_t)d->W(6) >> shift; + d->W(7) = (int16_t)d->W(7) >> shift; +#endif +} + +void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 15) { + d->Q(0) = 0; +#if SHIFT == 1 + d->Q(1) = 0; +#endif + } else { + shift = s->B(0); + d->W(0) <<= shift; + d->W(1) <<= shift; + d->W(2) <<= shift; + d->W(3) <<= shift; +#if SHIFT == 1 + d->W(4) <<= shift; + d->W(5) <<= shift; + d->W(6) <<= shift; + d->W(7) <<= shift; +#endif + } +} + +void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 31) { + d->Q(0) = 0; +#if SHIFT == 1 + d->Q(1) = 0; +#endif + } else { + shift = s->B(0); + d->L(0) >>= shift; + d->L(1) >>= shift; +#if SHIFT == 1 + d->L(2) >>= shift; + d->L(3) >>= shift; +#endif + } +} + +void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 31) { + shift = 31; + } else { + shift = s->B(0); + } + d->L(0) = (int32_t)d->L(0) >> shift; + d->L(1) = (int32_t)d->L(1) >> shift; +#if SHIFT == 1 + d->L(2) = (int32_t)d->L(2) >> shift; + d->L(3) = (int32_t)d->L(3) >> shift; +#endif +} + +void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 31) { + d->Q(0) = 0; +#if SHIFT == 1 + d->Q(1) = 0; +#endif + } else { + shift = s->B(0); + d->L(0) <<= shift; + d->L(1) <<= shift; +#if SHIFT == 1 + d->L(2) <<= shift; + d->L(3) <<= shift; +#endif + } +} + +void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 63) { + d->Q(0) = 0; +#if SHIFT == 1 + d->Q(1) = 0; +#endif + } else { + shift = s->B(0); + d->Q(0) >>= shift; +#if SHIFT == 1 + d->Q(1) >>= shift; +#endif + } +} + +void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s) +{ + int shift; + + if (s->Q(0) > 63) { + d->Q(0) = 0; +#if SHIFT == 1 + d->Q(1) = 0; +#endif + } else { + shift = s->B(0); + d->Q(0) <<= shift; +#if SHIFT == 1 + d->Q(1) <<= shift; +#endif + } +} + +#if SHIFT == 1 +void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s) 
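+/* Byte-granular right shift of the full 128-bit register: the count comes from the low
+   dword of the source and is clamped to 16.  Illustration: with shift == 2 the loops
+   below yield d->B(i) = old d->B(i + 2) for i < 14 and zero d->B(14), d->B(15). */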
+{ + int shift, i; + + shift = s->L(0); + if (shift > 16) + shift = 16; + for(i = 0; i < 16 - shift; i++) + d->B(i) = d->B(i + shift); + for(i = 16 - shift; i < 16; i++) + d->B(i) = 0; +} + +void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s) +{ + int shift, i; + + shift = s->L(0); + if (shift > 16) + shift = 16; + for(i = 15; i >= shift; i--) + d->B(i) = d->B(i - shift); + for(i = 0; i < shift; i++) + d->B(i) = 0; +} +#endif + +#define SSE_HELPER_B(name, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s)\ +{\ + d->B(0) = F(d->B(0), s->B(0));\ + d->B(1) = F(d->B(1), s->B(1));\ + d->B(2) = F(d->B(2), s->B(2));\ + d->B(3) = F(d->B(3), s->B(3));\ + d->B(4) = F(d->B(4), s->B(4));\ + d->B(5) = F(d->B(5), s->B(5));\ + d->B(6) = F(d->B(6), s->B(6));\ + d->B(7) = F(d->B(7), s->B(7));\ + XMM_ONLY(\ + d->B(8) = F(d->B(8), s->B(8));\ + d->B(9) = F(d->B(9), s->B(9));\ + d->B(10) = F(d->B(10), s->B(10));\ + d->B(11) = F(d->B(11), s->B(11));\ + d->B(12) = F(d->B(12), s->B(12));\ + d->B(13) = F(d->B(13), s->B(13));\ + d->B(14) = F(d->B(14), s->B(14));\ + d->B(15) = F(d->B(15), s->B(15));\ + )\ +} + +#define SSE_HELPER_W(name, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s)\ +{\ + d->W(0) = F(d->W(0), s->W(0));\ + d->W(1) = F(d->W(1), s->W(1));\ + d->W(2) = F(d->W(2), s->W(2));\ + d->W(3) = F(d->W(3), s->W(3));\ + XMM_ONLY(\ + d->W(4) = F(d->W(4), s->W(4));\ + d->W(5) = F(d->W(5), s->W(5));\ + d->W(6) = F(d->W(6), s->W(6));\ + d->W(7) = F(d->W(7), s->W(7));\ + )\ +} + +#define SSE_HELPER_L(name, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s)\ +{\ + d->L(0) = F(d->L(0), s->L(0));\ + d->L(1) = F(d->L(1), s->L(1));\ + XMM_ONLY(\ + d->L(2) = F(d->L(2), s->L(2));\ + d->L(3) = F(d->L(3), s->L(3));\ + )\ +} + +#define SSE_HELPER_Q(name, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s)\ +{\ + d->Q(0) = F(d->Q(0), s->Q(0));\ + XMM_ONLY(\ + d->Q(1) = F(d->Q(1), s->Q(1));\ + )\ +} + +#if SHIFT == 0 +static inline int satub(int x) +{ + if (x < 0) + return 0; + else if (x > 255) + return 255; + else + return x; +} + +static inline int satuw(int x) +{ + if (x < 0) + return 0; + else if (x > 65535) + return 65535; + else + return x; +} + +static inline int satsb(int x) +{ + if (x < -128) + return -128; + else if (x > 127) + return 127; + else + return x; +} + +static inline int satsw(int x) +{ + if (x < -32768) + return -32768; + else if (x > 32767) + return 32767; + else + return x; +} + +#define FADD(a, b) ((a) + (b)) +#define FADDUB(a, b) satub((a) + (b)) +#define FADDUW(a, b) satuw((a) + (b)) +#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b)) +#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b)) + +#define FSUB(a, b) ((a) - (b)) +#define FSUBUB(a, b) satub((a) - (b)) +#define FSUBUW(a, b) satuw((a) - (b)) +#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b)) +#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b)) +#define FMINUB(a, b) ((a) < (b)) ? (a) : (b) +#define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b) +#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b) +#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b) + +#define FAND(a, b) (a) & (b) +#define FANDN(a, b) ((~(a)) & (b)) +#define FOR(a, b) (a) | (b) +#define FXOR(a, b) (a) ^ (b) + +#define FCMPGTB(a, b) (int8_t)(a) > (int8_t)(b) ? -1 : 0 +#define FCMPGTW(a, b) (int16_t)(a) > (int16_t)(b) ? -1 : 0 +#define FCMPGTL(a, b) (int32_t)(a) > (int32_t)(b) ? -1 : 0 +#define FCMPEQ(a, b) (a) == (b) ? 
-1 : 0 + +#define FMULLW(a, b) (a) * (b) +#define FMULHRW(a, b) ((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16 +#define FMULHUW(a, b) (a) * (b) >> 16 +#define FMULHW(a, b) (int16_t)(a) * (int16_t)(b) >> 16 + +#define FAVG(a, b) ((a) + (b) + 1) >> 1 +#endif + +SSE_HELPER_B(helper_paddb, FADD) +SSE_HELPER_W(helper_paddw, FADD) +SSE_HELPER_L(helper_paddl, FADD) +SSE_HELPER_Q(helper_paddq, FADD) + +SSE_HELPER_B(helper_psubb, FSUB) +SSE_HELPER_W(helper_psubw, FSUB) +SSE_HELPER_L(helper_psubl, FSUB) +SSE_HELPER_Q(helper_psubq, FSUB) + +SSE_HELPER_B(helper_paddusb, FADDUB) +SSE_HELPER_B(helper_paddsb, FADDSB) +SSE_HELPER_B(helper_psubusb, FSUBUB) +SSE_HELPER_B(helper_psubsb, FSUBSB) + +SSE_HELPER_W(helper_paddusw, FADDUW) +SSE_HELPER_W(helper_paddsw, FADDSW) +SSE_HELPER_W(helper_psubusw, FSUBUW) +SSE_HELPER_W(helper_psubsw, FSUBSW) + +SSE_HELPER_B(helper_pminub, FMINUB) +SSE_HELPER_B(helper_pmaxub, FMAXUB) + +SSE_HELPER_W(helper_pminsw, FMINSW) +SSE_HELPER_W(helper_pmaxsw, FMAXSW) + +SSE_HELPER_Q(helper_pand, FAND) +SSE_HELPER_Q(helper_pandn, FANDN) +SSE_HELPER_Q(helper_por, FOR) +SSE_HELPER_Q(helper_pxor, FXOR) + +SSE_HELPER_B(helper_pcmpgtb, FCMPGTB) +SSE_HELPER_W(helper_pcmpgtw, FCMPGTW) +SSE_HELPER_L(helper_pcmpgtl, FCMPGTL) + +SSE_HELPER_B(helper_pcmpeqb, FCMPEQ) +SSE_HELPER_W(helper_pcmpeqw, FCMPEQ) +SSE_HELPER_L(helper_pcmpeql, FCMPEQ) + +SSE_HELPER_W(helper_pmullw, FMULLW) +#if SHIFT == 0 +SSE_HELPER_W(helper_pmulhrw, FMULHRW) +#endif +SSE_HELPER_W(helper_pmulhuw, FMULHUW) +SSE_HELPER_W(helper_pmulhw, FMULHW) + +SSE_HELPER_B(helper_pavgb, FAVG) +SSE_HELPER_W(helper_pavgw, FAVG) + +void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s) +{ + d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0); +#if SHIFT == 1 + d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2); +#endif +} + +void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s) +{ + int i; + + for(i = 0; i < (2 << SHIFT); i++) { + d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) + + (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1); + } +} + +#if SHIFT == 0 +static inline int abs1(int a) +{ + if (a < 0) + return -a; + else + return a; +} +#endif +void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s) +{ + unsigned int val; + + val = 0; + val += abs1(d->B(0) - s->B(0)); + val += abs1(d->B(1) - s->B(1)); + val += abs1(d->B(2) - s->B(2)); + val += abs1(d->B(3) - s->B(3)); + val += abs1(d->B(4) - s->B(4)); + val += abs1(d->B(5) - s->B(5)); + val += abs1(d->B(6) - s->B(6)); + val += abs1(d->B(7) - s->B(7)); + d->Q(0) = val; +#if SHIFT == 1 + val = 0; + val += abs1(d->B(8) - s->B(8)); + val += abs1(d->B(9) - s->B(9)); + val += abs1(d->B(10) - s->B(10)); + val += abs1(d->B(11) - s->B(11)); + val += abs1(d->B(12) - s->B(12)); + val += abs1(d->B(13) - s->B(13)); + val += abs1(d->B(14) - s->B(14)); + val += abs1(d->B(15) - s->B(15)); + d->Q(1) = val; +#endif +} + +void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0) +{ + int i; + for(i = 0; i < (8 << SHIFT); i++) { + if (s->B(i) & 0x80) + stb(a0 + i, d->B(i)); + } +} + +void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val) +{ + d->L(0) = val; + d->L(1) = 0; +#if SHIFT == 1 + d->Q(1) = 0; +#endif +} + +#ifdef TARGET_X86_64 +void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val) +{ + d->Q(0) = val; +#if SHIFT == 1 + d->Q(1) = 0; +#endif +} +#endif + +#if SHIFT == 0 +void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order) +{ + Reg r; + r.W(0) = s->W(order & 3); + r.W(1) = s->W((order >> 2) & 3); + r.W(2) = s->W((order >> 4) & 3); + r.W(3) = s->W((order >> 6) & 3); + *d = r; +} +#else +void 
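+/* All of the shuffle helpers consume the 8-bit 'order' immediate two bits per result
+   element.  Illustration: order = 0x1b (fields 3, 2, 1, 0 from the low end) makes
+   helper_pshufd below produce {s->L(3), s->L(2), s->L(1), s->L(0)}, i.e. it reverses
+   the four dwords; shufps/shufpd mix fields from d and s exactly as coded below. */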
helper_shufps(Reg *d, Reg *s, int order) +{ + Reg r; + r.L(0) = d->L(order & 3); + r.L(1) = d->L((order >> 2) & 3); + r.L(2) = s->L((order >> 4) & 3); + r.L(3) = s->L((order >> 6) & 3); + *d = r; +} + +void helper_shufpd(Reg *d, Reg *s, int order) +{ + Reg r; + r.Q(0) = d->Q(order & 1); + r.Q(1) = s->Q((order >> 1) & 1); + *d = r; +} + +void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order) +{ + Reg r; + r.L(0) = s->L(order & 3); + r.L(1) = s->L((order >> 2) & 3); + r.L(2) = s->L((order >> 4) & 3); + r.L(3) = s->L((order >> 6) & 3); + *d = r; +} + +void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order) +{ + Reg r; + r.W(0) = s->W(order & 3); + r.W(1) = s->W((order >> 2) & 3); + r.W(2) = s->W((order >> 4) & 3); + r.W(3) = s->W((order >> 6) & 3); + r.Q(1) = s->Q(1); + *d = r; +} + +void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order) +{ + Reg r; + r.Q(0) = s->Q(0); + r.W(4) = s->W(4 + (order & 3)); + r.W(5) = s->W(4 + ((order >> 2) & 3)); + r.W(6) = s->W(4 + ((order >> 4) & 3)); + r.W(7) = s->W(4 + ((order >> 6) & 3)); + *d = r; +} +#endif + +#if SHIFT == 1 +/* FPU ops */ +/* XXX: not accurate */ + +#define SSE_HELPER_S(name, F)\ +void helper_ ## name ## ps (Reg *d, Reg *s)\ +{\ + d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\ + d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));\ + d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));\ + d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));\ +}\ +\ +void helper_ ## name ## ss (Reg *d, Reg *s)\ +{\ + d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\ +}\ +void helper_ ## name ## pd (Reg *d, Reg *s)\ +{\ + d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\ + d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));\ +}\ +\ +void helper_ ## name ## sd (Reg *d, Reg *s)\ +{\ + d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\ +} + +#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status) +#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status) +#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status) +#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status) +#define FPU_MIN(size, a, b) (a) < (b) ? (a) : (b) +#define FPU_MAX(size, a, b) (a) > (b) ? 
(a) : (b) +#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status) + +SSE_HELPER_S(add, FPU_ADD) +SSE_HELPER_S(sub, FPU_SUB) +SSE_HELPER_S(mul, FPU_MUL) +SSE_HELPER_S(div, FPU_DIV) +SSE_HELPER_S(min, FPU_MIN) +SSE_HELPER_S(max, FPU_MAX) +SSE_HELPER_S(sqrt, FPU_SQRT) + + +/* float to float conversions */ +void helper_cvtps2pd(Reg *d, Reg *s) +{ + float32 s0, s1; + s0 = s->XMM_S(0); + s1 = s->XMM_S(1); + d->XMM_D(0) = float32_to_float64(s0, &env->sse_status); + d->XMM_D(1) = float32_to_float64(s1, &env->sse_status); +} + +void helper_cvtpd2ps(Reg *d, Reg *s) +{ + d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status); + d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status); + d->Q(1) = 0; +} + +void helper_cvtss2sd(Reg *d, Reg *s) +{ + d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status); +} + +void helper_cvtsd2ss(Reg *d, Reg *s) +{ + d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status); +} + +/* integer to float */ +void helper_cvtdq2ps(Reg *d, Reg *s) +{ + d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status); + d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status); + d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status); + d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status); +} + +void helper_cvtdq2pd(Reg *d, Reg *s) +{ + int32_t l0, l1; + l0 = (int32_t)s->XMM_L(0); + l1 = (int32_t)s->XMM_L(1); + d->XMM_D(0) = int32_to_float64(l0, &env->sse_status); + d->XMM_D(1) = int32_to_float64(l1, &env->sse_status); +} + +void helper_cvtpi2ps(XMMReg *d, MMXReg *s) +{ + d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status); + d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status); +} + +void helper_cvtpi2pd(XMMReg *d, MMXReg *s) +{ + d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status); + d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status); +} + +void helper_cvtsi2ss(XMMReg *d, uint32_t val) +{ + d->XMM_S(0) = int32_to_float32(val, &env->sse_status); +} + +void helper_cvtsi2sd(XMMReg *d, uint32_t val) +{ + d->XMM_D(0) = int32_to_float64(val, &env->sse_status); +} + +#ifdef TARGET_X86_64 +void helper_cvtsq2ss(XMMReg *d, uint64_t val) +{ + d->XMM_S(0) = int64_to_float32(val, &env->sse_status); +} + +void helper_cvtsq2sd(XMMReg *d, uint64_t val) +{ + d->XMM_D(0) = int64_to_float64(val, &env->sse_status); +} +#endif + +/* float to integer */ +void helper_cvtps2dq(XMMReg *d, XMMReg *s) +{ + d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status); + d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status); + d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status); + d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status); +} + +void helper_cvtpd2dq(XMMReg *d, XMMReg *s) +{ + d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status); + d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status); + d->XMM_Q(1) = 0; +} + +void helper_cvtps2pi(MMXReg *d, XMMReg *s) +{ + d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status); + d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status); +} + +void helper_cvtpd2pi(MMXReg *d, XMMReg *s) +{ + d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status); + d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status); +} + +int32_t helper_cvtss2si(XMMReg *s) +{ + return float32_to_int32(s->XMM_S(0), &env->sse_status); +} + +int32_t helper_cvtsd2si(XMMReg *s) +{ + return float64_to_int32(s->XMM_D(0), &env->sse_status); +} + +#ifdef TARGET_X86_64 +int64_t helper_cvtss2sq(XMMReg *s) +{ + return 
float32_to_int64(s->XMM_S(0), &env->sse_status); +} + +int64_t helper_cvtsd2sq(XMMReg *s) +{ + return float64_to_int64(s->XMM_D(0), &env->sse_status); +} +#endif + +/* float to integer truncated */ +void helper_cvttps2dq(XMMReg *d, XMMReg *s) +{ + d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status); + d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status); + d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status); + d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status); +} + +void helper_cvttpd2dq(XMMReg *d, XMMReg *s) +{ + d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status); + d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status); + d->XMM_Q(1) = 0; +} + +void helper_cvttps2pi(MMXReg *d, XMMReg *s) +{ + d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status); + d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status); +} + +void helper_cvttpd2pi(MMXReg *d, XMMReg *s) +{ + d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status); + d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status); +} + +int32_t helper_cvttss2si(XMMReg *s) +{ + return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status); +} + +int32_t helper_cvttsd2si(XMMReg *s) +{ + return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status); +} + +#ifdef TARGET_X86_64 +int64_t helper_cvttss2sq(XMMReg *s) +{ + return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status); +} + +int64_t helper_cvttsd2sq(XMMReg *s) +{ + return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status); +} +#endif + +void helper_rsqrtps(XMMReg *d, XMMReg *s) +{ + d->XMM_S(0) = approx_rsqrt(s->XMM_S(0)); + d->XMM_S(1) = approx_rsqrt(s->XMM_S(1)); + d->XMM_S(2) = approx_rsqrt(s->XMM_S(2)); + d->XMM_S(3) = approx_rsqrt(s->XMM_S(3)); +} + +void helper_rsqrtss(XMMReg *d, XMMReg *s) +{ + d->XMM_S(0) = approx_rsqrt(s->XMM_S(0)); +} + +void helper_rcpps(XMMReg *d, XMMReg *s) +{ + d->XMM_S(0) = approx_rcp(s->XMM_S(0)); + d->XMM_S(1) = approx_rcp(s->XMM_S(1)); + d->XMM_S(2) = approx_rcp(s->XMM_S(2)); + d->XMM_S(3) = approx_rcp(s->XMM_S(3)); +} + +void helper_rcpss(XMMReg *d, XMMReg *s) +{ + d->XMM_S(0) = approx_rcp(s->XMM_S(0)); +} + +static inline uint64_t helper_extrq(uint64_t src, int shift, int len) +{ + uint64_t mask; + + if (len == 0) { + mask = ~0LL; + } else { + mask = (1ULL << len) - 1; + } + return (src >> shift) & mask; +} + +void helper_extrq_r(XMMReg *d, XMMReg *s) +{ + d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), s->XMM_B(1), s->XMM_B(0)); +} + +void helper_extrq_i(XMMReg *d, int index, int length) +{ + d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), index, length); +} + +static inline uint64_t helper_insertq(uint64_t src, int shift, int len) +{ + uint64_t mask; + + if (len == 0) { + mask = ~0ULL; + } else { + mask = (1ULL << len) - 1; + } + return (src & ~(mask << shift)) | ((src & mask) << shift); +} + +void helper_insertq_r(XMMReg *d, XMMReg *s) +{ + d->XMM_Q(0) = helper_insertq(s->XMM_Q(0), s->XMM_B(9), s->XMM_B(8)); +} + +void helper_insertq_i(XMMReg *d, int index, int length) +{ + d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), index, length); +} + +void helper_haddps(XMMReg *d, XMMReg *s) +{ + XMMReg r; + r.XMM_S(0) = d->XMM_S(0) + d->XMM_S(1); + r.XMM_S(1) = d->XMM_S(2) + d->XMM_S(3); + r.XMM_S(2) = s->XMM_S(0) + s->XMM_S(1); + r.XMM_S(3) = s->XMM_S(2) + s->XMM_S(3); + *d = r; +} + +void helper_haddpd(XMMReg *d, XMMReg *s) 
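+/* Horizontal add of adjacent double pairs.  Illustration: d = {1.0, 2.0} and
+   s = {10.0, 20.0} give d = {3.0, 30.0}. */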
+{ + XMMReg r; + r.XMM_D(0) = d->XMM_D(0) + d->XMM_D(1); + r.XMM_D(1) = s->XMM_D(0) + s->XMM_D(1); + *d = r; +} + +void helper_hsubps(XMMReg *d, XMMReg *s) +{ + XMMReg r; + r.XMM_S(0) = d->XMM_S(0) - d->XMM_S(1); + r.XMM_S(1) = d->XMM_S(2) - d->XMM_S(3); + r.XMM_S(2) = s->XMM_S(0) - s->XMM_S(1); + r.XMM_S(3) = s->XMM_S(2) - s->XMM_S(3); + *d = r; +} + +void helper_hsubpd(XMMReg *d, XMMReg *s) +{ + XMMReg r; + r.XMM_D(0) = d->XMM_D(0) - d->XMM_D(1); + r.XMM_D(1) = s->XMM_D(0) - s->XMM_D(1); + *d = r; +} + +void helper_addsubps(XMMReg *d, XMMReg *s) +{ + d->XMM_S(0) = d->XMM_S(0) - s->XMM_S(0); + d->XMM_S(1) = d->XMM_S(1) + s->XMM_S(1); + d->XMM_S(2) = d->XMM_S(2) - s->XMM_S(2); + d->XMM_S(3) = d->XMM_S(3) + s->XMM_S(3); +} + +void helper_addsubpd(XMMReg *d, XMMReg *s) +{ + d->XMM_D(0) = d->XMM_D(0) - s->XMM_D(0); + d->XMM_D(1) = d->XMM_D(1) + s->XMM_D(1); +} + +/* XXX: unordered */ +#define SSE_HELPER_CMP(name, F)\ +void helper_ ## name ## ps (Reg *d, Reg *s)\ +{\ + d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\ + d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));\ + d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));\ + d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));\ +}\ +\ +void helper_ ## name ## ss (Reg *d, Reg *s)\ +{\ + d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\ +}\ +void helper_ ## name ## pd (Reg *d, Reg *s)\ +{\ + d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\ + d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));\ +}\ +\ +void helper_ ## name ## sd (Reg *d, Reg *s)\ +{\ + d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\ +} + +#define FPU_CMPEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? -1 : 0 +#define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0 +#define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0 +#define FPU_CMPUNORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? - 1 : 0 +#define FPU_CMPNEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? 0 : -1 +#define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1 +#define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1 +#define FPU_CMPORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? 
0 : -1 + +SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) +SSE_HELPER_CMP(cmplt, FPU_CMPLT) +SSE_HELPER_CMP(cmple, FPU_CMPLE) +SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) +SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) +SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) +SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) +SSE_HELPER_CMP(cmpord, FPU_CMPORD) + +static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; + +void helper_ucomiss(Reg *d, Reg *s) +{ + int ret; + float32 s0, s1; + + s0 = d->XMM_S(0); + s1 = s->XMM_S(0); + ret = float32_compare_quiet(s0, s1, &env->sse_status); + CC_SRC = comis_eflags[ret + 1]; +} + +void helper_comiss(Reg *d, Reg *s) +{ + int ret; + float32 s0, s1; + + s0 = d->XMM_S(0); + s1 = s->XMM_S(0); + ret = float32_compare(s0, s1, &env->sse_status); + CC_SRC = comis_eflags[ret + 1]; +} + +void helper_ucomisd(Reg *d, Reg *s) +{ + int ret; + float64 d0, d1; + + d0 = d->XMM_D(0); + d1 = s->XMM_D(0); + ret = float64_compare_quiet(d0, d1, &env->sse_status); + CC_SRC = comis_eflags[ret + 1]; +} + +void helper_comisd(Reg *d, Reg *s) +{ + int ret; + float64 d0, d1; + + d0 = d->XMM_D(0); + d1 = s->XMM_D(0); + ret = float64_compare(d0, d1, &env->sse_status); + CC_SRC = comis_eflags[ret + 1]; +} + +uint32_t helper_movmskps(Reg *s) +{ + int b0, b1, b2, b3; + b0 = s->XMM_L(0) >> 31; + b1 = s->XMM_L(1) >> 31; + b2 = s->XMM_L(2) >> 31; + b3 = s->XMM_L(3) >> 31; + return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3); +} + +uint32_t helper_movmskpd(Reg *s) +{ + int b0, b1; + b0 = s->XMM_L(1) >> 31; + b1 = s->XMM_L(3) >> 31; + return b0 | (b1 << 1); +} + +#endif + +uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s) +{ + uint32_t val; + val = 0; + val |= (s->B(0) >> 7); + val |= (s->B(1) >> 6) & 0x02; + val |= (s->B(2) >> 5) & 0x04; + val |= (s->B(3) >> 4) & 0x08; + val |= (s->B(4) >> 3) & 0x10; + val |= (s->B(5) >> 2) & 0x20; + val |= (s->B(6) >> 1) & 0x40; + val |= (s->B(7)) & 0x80; +#if SHIFT == 1 + val |= (s->B(8) << 1) & 0x0100; + val |= (s->B(9) << 2) & 0x0200; + val |= (s->B(10) << 3) & 0x0400; + val |= (s->B(11) << 4) & 0x0800; + val |= (s->B(12) << 5) & 0x1000; + val |= (s->B(13) << 6) & 0x2000; + val |= (s->B(14) << 7) & 0x4000; + val |= (s->B(15) << 8) & 0x8000; +#endif + return val; +} + +void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s) +{ + Reg r; + + r.B(0) = satsb((int16_t)d->W(0)); + r.B(1) = satsb((int16_t)d->W(1)); + r.B(2) = satsb((int16_t)d->W(2)); + r.B(3) = satsb((int16_t)d->W(3)); +#if SHIFT == 1 + r.B(4) = satsb((int16_t)d->W(4)); + r.B(5) = satsb((int16_t)d->W(5)); + r.B(6) = satsb((int16_t)d->W(6)); + r.B(7) = satsb((int16_t)d->W(7)); +#endif + r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0)); + r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1)); + r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2)); + r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3)); +#if SHIFT == 1 + r.B(12) = satsb((int16_t)s->W(4)); + r.B(13) = satsb((int16_t)s->W(5)); + r.B(14) = satsb((int16_t)s->W(6)); + r.B(15) = satsb((int16_t)s->W(7)); +#endif + *d = r; +} + +void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s) +{ + Reg r; + + r.B(0) = satub((int16_t)d->W(0)); + r.B(1) = satub((int16_t)d->W(1)); + r.B(2) = satub((int16_t)d->W(2)); + r.B(3) = satub((int16_t)d->W(3)); +#if SHIFT == 1 + r.B(4) = satub((int16_t)d->W(4)); + r.B(5) = satub((int16_t)d->W(5)); + r.B(6) = satub((int16_t)d->W(6)); + r.B(7) = satub((int16_t)d->W(7)); +#endif + r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0)); + r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1)); + r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2)); + r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3)); +#if SHIFT 
== 1 + r.B(12) = satub((int16_t)s->W(4)); + r.B(13) = satub((int16_t)s->W(5)); + r.B(14) = satub((int16_t)s->W(6)); + r.B(15) = satub((int16_t)s->W(7)); +#endif + *d = r; +} + +void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s) +{ + Reg r; + + r.W(0) = satsw(d->L(0)); + r.W(1) = satsw(d->L(1)); +#if SHIFT == 1 + r.W(2) = satsw(d->L(2)); + r.W(3) = satsw(d->L(3)); +#endif + r.W((2 << SHIFT) + 0) = satsw(s->L(0)); + r.W((2 << SHIFT) + 1) = satsw(s->L(1)); +#if SHIFT == 1 + r.W(6) = satsw(s->L(2)); + r.W(7) = satsw(s->L(3)); +#endif + *d = r; +} + +#define UNPCK_OP(base_name, base) \ + \ +void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s) \ +{ \ + Reg r; \ + \ + r.B(0) = d->B((base << (SHIFT + 2)) + 0); \ + r.B(1) = s->B((base << (SHIFT + 2)) + 0); \ + r.B(2) = d->B((base << (SHIFT + 2)) + 1); \ + r.B(3) = s->B((base << (SHIFT + 2)) + 1); \ + r.B(4) = d->B((base << (SHIFT + 2)) + 2); \ + r.B(5) = s->B((base << (SHIFT + 2)) + 2); \ + r.B(6) = d->B((base << (SHIFT + 2)) + 3); \ + r.B(7) = s->B((base << (SHIFT + 2)) + 3); \ +XMM_ONLY( \ + r.B(8) = d->B((base << (SHIFT + 2)) + 4); \ + r.B(9) = s->B((base << (SHIFT + 2)) + 4); \ + r.B(10) = d->B((base << (SHIFT + 2)) + 5); \ + r.B(11) = s->B((base << (SHIFT + 2)) + 5); \ + r.B(12) = d->B((base << (SHIFT + 2)) + 6); \ + r.B(13) = s->B((base << (SHIFT + 2)) + 6); \ + r.B(14) = d->B((base << (SHIFT + 2)) + 7); \ + r.B(15) = s->B((base << (SHIFT + 2)) + 7); \ +) \ + *d = r; \ +} \ + \ +void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s) \ +{ \ + Reg r; \ + \ + r.W(0) = d->W((base << (SHIFT + 1)) + 0); \ + r.W(1) = s->W((base << (SHIFT + 1)) + 0); \ + r.W(2) = d->W((base << (SHIFT + 1)) + 1); \ + r.W(3) = s->W((base << (SHIFT + 1)) + 1); \ +XMM_ONLY( \ + r.W(4) = d->W((base << (SHIFT + 1)) + 2); \ + r.W(5) = s->W((base << (SHIFT + 1)) + 2); \ + r.W(6) = d->W((base << (SHIFT + 1)) + 3); \ + r.W(7) = s->W((base << (SHIFT + 1)) + 3); \ +) \ + *d = r; \ +} \ + \ +void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s) \ +{ \ + Reg r; \ + \ + r.L(0) = d->L((base << SHIFT) + 0); \ + r.L(1) = s->L((base << SHIFT) + 0); \ +XMM_ONLY( \ + r.L(2) = d->L((base << SHIFT) + 1); \ + r.L(3) = s->L((base << SHIFT) + 1); \ +) \ + *d = r; \ +} \ + \ +XMM_ONLY( \ +void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s) \ +{ \ + Reg r; \ + \ + r.Q(0) = d->Q(base); \ + r.Q(1) = s->Q(base); \ + *d = r; \ +} \ +) + +UNPCK_OP(l, 0) +UNPCK_OP(h, 1) + +/* 3DNow! 
float ops */ +#if SHIFT == 0 +void helper_pi2fd(MMXReg *d, MMXReg *s) +{ + d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status); + d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status); +} + +void helper_pi2fw(MMXReg *d, MMXReg *s) +{ + d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status); + d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status); +} + +void helper_pf2id(MMXReg *d, MMXReg *s) +{ + d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status); + d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status); +} + +void helper_pf2iw(MMXReg *d, MMXReg *s) +{ + d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status)); + d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status)); +} + +void helper_pfacc(MMXReg *d, MMXReg *s) +{ + MMXReg r; + r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); + r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); + *d = r; +} + +void helper_pfadd(MMXReg *d, MMXReg *s) +{ + d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); + d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); +} + +void helper_pfcmpeq(MMXReg *d, MMXReg *s) +{ + d->MMX_L(0) = float32_eq(d->MMX_S(0), s->MMX_S(0), &env->mmx_status) ? -1 : 0; + d->MMX_L(1) = float32_eq(d->MMX_S(1), s->MMX_S(1), &env->mmx_status) ? -1 : 0; +} + +void helper_pfcmpge(MMXReg *d, MMXReg *s) +{ + d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0; + d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0; +} + +void helper_pfcmpgt(MMXReg *d, MMXReg *s) +{ + d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0; + d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? 
-1 : 0; +} + +void helper_pfmax(MMXReg *d, MMXReg *s) +{ + if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) + d->MMX_S(0) = s->MMX_S(0); + if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) + d->MMX_S(1) = s->MMX_S(1); +} + +void helper_pfmin(MMXReg *d, MMXReg *s) +{ + if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) + d->MMX_S(0) = s->MMX_S(0); + if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) + d->MMX_S(1) = s->MMX_S(1); +} + +void helper_pfmul(MMXReg *d, MMXReg *s) +{ + d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); + d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); +} + +void helper_pfnacc(MMXReg *d, MMXReg *s) +{ + MMXReg r; + r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); + r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); + *d = r; +} + +void helper_pfpnacc(MMXReg *d, MMXReg *s) +{ + MMXReg r; + r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); + r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); + *d = r; +} + +void helper_pfrcp(MMXReg *d, MMXReg *s) +{ + d->MMX_S(0) = approx_rcp(s->MMX_S(0)); + d->MMX_S(1) = d->MMX_S(0); +} + +void helper_pfrsqrt(MMXReg *d, MMXReg *s) +{ + d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff; + d->MMX_S(1) = approx_rsqrt(d->MMX_S(1)); + d->MMX_L(1) |= s->MMX_L(0) & 0x80000000; + d->MMX_L(0) = d->MMX_L(1); +} + +void helper_pfsub(MMXReg *d, MMXReg *s) +{ + d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); + d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); +} + +void helper_pfsubr(MMXReg *d, MMXReg *s) +{ + d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status); + d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status); +} + +void helper_pswapd(MMXReg *d, MMXReg *s) +{ + MMXReg r; + r.MMX_L(0) = s->MMX_L(1); + r.MMX_L(1) = s->MMX_L(0); + *d = r; +} +#endif + +/* SSSE3 op helpers */ +void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s) +{ + int i; + Reg r; + + for (i = 0; i < (8 << SHIFT); i++) + r.B(i) = (s->B(i) & 0x80) ? 
0 : (d->B(s->B(i) & ((8 << SHIFT) - 1))); + + *d = r; +} + +void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s) +{ + d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1); + d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3); + XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5)); + XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7)); + d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1); + d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3); + XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5)); + XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7)); +} + +void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s) +{ + d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1); + XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3)); + d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1); + XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3)); +} + +void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s) +{ + d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1)); + d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3)); + XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5))); + XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7))); + d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1)); + d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3)); + XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5))); + XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7))); +} + +void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s) +{ + d->W(0) = satsw((int8_t)s->B( 0) * (uint8_t)d->B( 0) + + (int8_t)s->B( 1) * (uint8_t)d->B( 1)); + d->W(1) = satsw((int8_t)s->B( 2) * (uint8_t)d->B( 2) + + (int8_t)s->B( 3) * (uint8_t)d->B( 3)); + d->W(2) = satsw((int8_t)s->B( 4) * (uint8_t)d->B( 4) + + (int8_t)s->B( 5) * (uint8_t)d->B( 5)); + d->W(3) = satsw((int8_t)s->B( 6) * (uint8_t)d->B( 6) + + (int8_t)s->B( 7) * (uint8_t)d->B( 7)); +#if SHIFT == 1 + d->W(4) = satsw((int8_t)s->B( 8) * (uint8_t)d->B( 8) + + (int8_t)s->B( 9) * (uint8_t)d->B( 9)); + d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) + + (int8_t)s->B(11) * (uint8_t)d->B(11)); + d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) + + (int8_t)s->B(13) * (uint8_t)d->B(13)); + d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) + + (int8_t)s->B(15) * (uint8_t)d->B(15)); +#endif +} + +void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s) +{ + d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1); + d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3); + XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5)); + XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7)); + d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1); + d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3); + XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5)); + XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7)); +} + +void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s) +{ + d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1); + XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3)); + d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1); + XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3)); +} + +void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s) +{ + d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1)); + d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3)); + XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5))); + XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7))); + d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1)); + d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - 
(int16_t)s->W(3)); + XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5))); + XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7))); +} + +#define FABSB(_, x) x > INT8_MAX ? -(int8_t ) x : x +#define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x +#define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x +SSE_HELPER_B(helper_pabsb, FABSB) +SSE_HELPER_W(helper_pabsw, FABSW) +SSE_HELPER_L(helper_pabsd, FABSL) + +#define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15 +SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) + +#define FSIGNB(d, s) s <= INT8_MAX ? s ? d : 0 : -(int8_t ) d +#define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d +#define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d +SSE_HELPER_B(helper_psignb, FSIGNB) +SSE_HELPER_W(helper_psignw, FSIGNW) +SSE_HELPER_L(helper_psignd, FSIGNL) + +void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift) +{ + Reg r; + + /* XXX could be checked during translation */ + if (shift >= (16 << SHIFT)) { + r.Q(0) = 0; + XMM_ONLY(r.Q(1) = 0); + } else { + shift <<= 3; +#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) +#if SHIFT == 0 + r.Q(0) = SHR(s->Q(0), shift - 0) | + SHR(d->Q(0), shift - 64); +#else + r.Q(0) = SHR(s->Q(0), shift - 0) | + SHR(s->Q(1), shift - 64) | + SHR(d->Q(0), shift - 128) | + SHR(d->Q(1), shift - 192); + r.Q(1) = SHR(s->Q(0), shift + 64) | + SHR(s->Q(1), shift - 0) | + SHR(d->Q(0), shift - 64) | + SHR(d->Q(1), shift - 128); +#endif +#undef SHR + } + + *d = r; +} + +#define XMM0 env->xmm_regs[0] + +#if SHIFT == 1 +#define SSE_HELPER_V(name, elem, num, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s)\ +{\ + d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));\ + d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));\ + if (num > 2) {\ + d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));\ + d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));\ + if (num > 4) {\ + d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));\ + d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));\ + d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));\ + d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));\ + if (num > 8) {\ + d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8));\ + d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9));\ + d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10));\ + d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11));\ + d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12));\ + d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13));\ + d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14));\ + d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15));\ + }\ + }\ + }\ +} + +#define SSE_HELPER_I(name, elem, num, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\ +{\ + d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));\ + d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));\ + if (num > 2) {\ + d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));\ + d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));\ + if (num > 4) {\ + d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1));\ + d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1));\ + d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1));\ + d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1));\ + if (num > 8) {\ + d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1));\ + d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1));\ + d->elem(10) = F(d->elem(10), s->elem(10), ((imm >> 10) & 1));\ + d->elem(11) = F(d->elem(11), s->elem(11), ((imm >> 
11) & 1));\ + d->elem(12) = F(d->elem(12), s->elem(12), ((imm >> 12) & 1));\ + d->elem(13) = F(d->elem(13), s->elem(13), ((imm >> 13) & 1));\ + d->elem(14) = F(d->elem(14), s->elem(14), ((imm >> 14) & 1));\ + d->elem(15) = F(d->elem(15), s->elem(15), ((imm >> 15) & 1));\ + }\ + }\ + }\ +} + +/* SSE4.1 op helpers */ +#define FBLENDVB(d, s, m) (m & 0x80) ? s : d +#define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d +#define FBLENDVPD(d, s, m) (m & 0x8000000000000000LL) ? s : d +SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB) +SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS) +SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD) + +void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s) +{ + uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); + uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); + + CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); +} + +#define SSE_HELPER_F(name, elem, num, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s)\ +{\ + d->elem(0) = F(0);\ + d->elem(1) = F(1);\ + if (num > 2) {\ + d->elem(2) = F(2);\ + d->elem(3) = F(3);\ + if (num > 4) {\ + d->elem(4) = F(4);\ + d->elem(5) = F(5);\ + d->elem(6) = F(6);\ + d->elem(7) = F(7);\ + }\ + }\ +} + +SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B) +SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B) +SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B) +SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W) +SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W) +SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L) +SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B) +SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B) +SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B) +SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) +SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) +SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) + +void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s) +{ + d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0); + d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2); +} + +#define FCMPEQQ(d, s) d == s ? 
-1 : 0 +SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) + +void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s) +{ + d->W(0) = satuw((int32_t) d->L(0)); + d->W(1) = satuw((int32_t) d->L(1)); + d->W(2) = satuw((int32_t) d->L(2)); + d->W(3) = satuw((int32_t) d->L(3)); + d->W(4) = satuw((int32_t) s->L(0)); + d->W(5) = satuw((int32_t) s->L(1)); + d->W(6) = satuw((int32_t) s->L(2)); + d->W(7) = satuw((int32_t) s->L(3)); +} + +#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s) +#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s) +#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s) +#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s) +SSE_HELPER_B(helper_pminsb, FMINSB) +SSE_HELPER_L(helper_pminsd, FMINSD) +SSE_HELPER_W(helper_pminuw, MIN) +SSE_HELPER_L(helper_pminud, MIN) +SSE_HELPER_B(helper_pmaxsb, FMAXSB) +SSE_HELPER_L(helper_pmaxsd, FMAXSD) +SSE_HELPER_W(helper_pmaxuw, MAX) +SSE_HELPER_L(helper_pmaxud, MAX) + +#define FMULLD(d, s) (int32_t) d * (int32_t) s +SSE_HELPER_L(helper_pmulld, FMULLD) + +void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s) +{ + int idx = 0; + + if (s->W(1) < s->W(idx)) + idx = 1; + if (s->W(2) < s->W(idx)) + idx = 2; + if (s->W(3) < s->W(idx)) + idx = 3; + if (s->W(4) < s->W(idx)) + idx = 4; + if (s->W(5) < s->W(idx)) + idx = 5; + if (s->W(6) < s->W(idx)) + idx = 6; + if (s->W(7) < s->W(idx)) + idx = 7; + + d->Q(1) = 0; + d->L(1) = 0; + d->W(1) = idx; + d->W(0) = s->W(idx); +} + +void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) + switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); + d->L(1) = float64_round_to_int(s->L(1), &env->sse_status); + d->L(2) = float64_round_to_int(s->L(2), &env->sse_status); + d->L(3) = float64_round_to_int(s->L(3), &env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) + switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); + d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) 
+ switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) + switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +#define FBLENDP(d, s, m) m ? s : d +SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) +SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) +SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) + +void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask) +{ + float32 iresult = 0 /*float32_zero*/; + + if (mask & (1 << 4)) + iresult = float32_add(iresult, + float32_mul(d->L(0), s->L(0), &env->sse_status), + &env->sse_status); + if (mask & (1 << 5)) + iresult = float32_add(iresult, + float32_mul(d->L(1), s->L(1), &env->sse_status), + &env->sse_status); + if (mask & (1 << 6)) + iresult = float32_add(iresult, + float32_mul(d->L(2), s->L(2), &env->sse_status), + &env->sse_status); + if (mask & (1 << 7)) + iresult = float32_add(iresult, + float32_mul(d->L(3), s->L(3), &env->sse_status), + &env->sse_status); + d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/; + d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/; + d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/; + d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/; +} + +void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask) +{ + float64 iresult = 0 /*float64_zero*/; + + if (mask & (1 << 4)) + iresult = float64_add(iresult, + float64_mul(d->Q(0), s->Q(0), &env->sse_status), + &env->sse_status); + if (mask & (1 << 5)) + iresult = float64_add(iresult, + float64_mul(d->Q(1), s->Q(1), &env->sse_status), + &env->sse_status); + d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/; + d->Q(1) = (mask & (1 << 1)) ? 
iresult : 0 /*float64_zero*/; +} + +void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset) +{ + int s0 = (offset & 3) << 2; + int d0 = (offset & 4) << 0; + int i; + Reg r; + + for (i = 0; i < 8; i++, d0++) { + r.W(i) = 0; + r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); + r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); + r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); + r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); + } + + *d = r; +} + +/* SSE4.2 op helpers */ +/* it's unclear whether signed or unsigned */ +#define FCMPGTQ(d, s) d > s ? -1 : 0 +SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) + +static inline int pcmp_elen(int reg, uint32_t ctrl) +{ + int val; + + /* Presence of REX.W is indicated by a bit higher than 7 set */ + if (ctrl >> 8) + val = abs1((int64_t) env->regs[reg]); + else + val = abs1((int32_t) env->regs[reg]); + + if (ctrl & 1) { + if (val > 8) + return 8; + } else + if (val > 16) + return 16; + + return val; +} + +static inline int pcmp_ilen(Reg *r, uint8_t ctrl) +{ + int val = 0; + + if (ctrl & 1) { + while (val < 8 && r->W(val)) + val++; + } else + while (val < 16 && r->B(val)) + val++; + + return val; +} + +static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) +{ + switch ((ctrl >> 0) & 3) { + case 0: + return r->B(i); + case 1: + return r->W(i); + case 2: + return (int8_t) r->B(i); + case 3: + default: + return (int16_t) r->W(i); + } +} + +static inline unsigned pcmpxstrx(Reg *d, Reg *s, + int8_t ctrl, int valids, int validd) +{ + unsigned int res = 0; + int v; + int j, i; + int upper = (ctrl & 1) ? 7 : 15; + + valids--; + validd--; + + CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0); + + switch ((ctrl >> 2) & 3) { + case 0: + for (j = valids; j >= 0; j--) { + res <<= 1; + v = pcmp_val(s, ctrl, j); + for (i = validd; i >= 0; i--) + res |= (v == pcmp_val(d, ctrl, i)); + } + break; + case 1: + for (j = valids; j >= 0; j--) { + res <<= 1; + v = pcmp_val(s, ctrl, j); + for (i = ((validd - 1) | 1); i >= 0; i -= 2) + res |= (pcmp_val(d, ctrl, i - 0) <= v && + pcmp_val(d, ctrl, i - 1) >= v); + } + break; + case 2: + res = (2 << (upper - MAX(valids, validd))) - 1; + res <<= MAX(valids, validd) - MIN(valids, validd); + for (i = MIN(valids, validd); i >= 0; i--) { + res <<= 1; + v = pcmp_val(s, ctrl, i); + res |= (v == pcmp_val(d, ctrl, i)); + } + break; + case 3: + for (j = valids - validd; j >= 0; j--) { + res <<= 1; + res |= 1; + for (i = MIN(upper - j, validd); i >= 0; i--) + res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i)); + } + break; + } + + switch ((ctrl >> 4) & 3) { + case 1: + res ^= (2 << upper) - 1; + break; + case 3: + res ^= (2 << valids) - 1; + break; + } + + if (res) + CC_SRC |= CC_C; + if (res & 1) + CC_SRC |= CC_O; + + return res; +} + +static inline int rffs1(unsigned int val) +{ + int ret = 1, hi; + + for (hi = sizeof(val) * 4; hi; hi /= 2) + if (val >> hi) { + val >>= hi; + ret += hi; + } + + return ret; +} + +static inline int ffs1(unsigned int val) +{ + int ret = 1, hi; + + for (hi = sizeof(val) * 4; hi; hi /= 2) + if (val << hi) { + val <<= hi; + ret += hi; + } + + return ret; +} + +void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_elen(R_EDX, ctrl), + pcmp_elen(R_EAX, ctrl)); + + if (res) +#ifndef VBOX + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; +#else + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? 
rffs1(res) : ffs1(res)) - 1; +#endif + else + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); +} + +void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + int i; + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_elen(R_EDX, ctrl), + pcmp_elen(R_EAX, ctrl)); + + if ((ctrl >> 6) & 1) { +#ifndef VBOX + if (ctrl & 1) + for (i = 0; i <= 8; i--, res >>= 1) + d->W(i) = (res & 1) ? ~0 : 0; + else + for (i = 0; i <= 16; i--, res >>= 1) + d->B(i) = (res & 1) ? ~0 : 0; +#else + if (ctrl & 1) + for (i = 0; i < 8; i++, res >>= 1) { + d->W(i) = (res & 1) ? ~0 : 0; + } + else + for (i = 0; i < 16; i++, res >>= 1) { + d->B(i) = (res & 1) ? ~0 : 0; + } +#endif + } else { + d->Q(1) = 0; + d->Q(0) = res; + } +} + +void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_ilen(s, ctrl), + pcmp_ilen(d, ctrl)); + + if (res) + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; + else + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); +} + +void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + int i; + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_ilen(s, ctrl), + pcmp_ilen(d, ctrl)); + + if ((ctrl >> 6) & 1) { +#ifndef VBOX + if (ctrl & 1) + for (i = 0; i <= 8; i--, res >>= 1) + d->W(i) = (res & 1) ? ~0 : 0; + else + for (i = 0; i <= 16; i--, res >>= 1) + d->B(i) = (res & 1) ? ~0 : 0; +#else + if (ctrl & 1) + for (i = 0; i < 8; i++, res >>= 1) { + d->W(i) = (res & 1) ? ~0 : 0; + } + else + for (i = 0; i < 16; i++, res >>= 1) { + d->B(i) = (res & 1) ? ~0 : 0; + } +#endif + } else { + d->Q(1) = 0; + d->Q(0) = res; + } +} + +#define CRCPOLY 0x1edc6f41 +#define CRCPOLY_BITREV 0x82f63b78 +target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) +{ + target_ulong crc = (msg & ((target_ulong) -1 >> + (TARGET_LONG_BITS - len))) ^ crc1; + + while (len--) + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); + + return crc; +} + +#define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1)) +#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i)) +target_ulong helper_popcnt(target_ulong n, uint32_t type) +{ + CC_SRC = n ? 0 : CC_Z; + + n = POPCOUNT(n, 0); + n = POPCOUNT(n, 1); + n = POPCOUNT(n, 2); + n = POPCOUNT(n, 3); + if (type == 1) + return n & 0xff; + + n = POPCOUNT(n, 4); +#ifndef TARGET_X86_64 + return n; +#else + if (type == 2) + return n & 0xff; + + return POPCOUNT(n, 5); +#endif +} +#endif + +#undef SHIFT +#undef XMM_ONLY +#undef Reg +#undef B +#undef W +#undef L +#undef Q +#undef SUFFIX diff --git a/src/recompiler/target-i386/ops_sse_header.h b/src/recompiler/target-i386/ops_sse_header.h new file mode 100644 index 00000000..efa3e905 --- /dev/null +++ b/src/recompiler/target-i386/ops_sse_header.h @@ -0,0 +1,359 @@ +/* + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support + * + * Copyright (c) 2005 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#if SHIFT == 0 +#define Reg MMXReg +#define SUFFIX _mmx +#else +#define Reg XMMReg +#define SUFFIX _xmm +#endif + +#define dh_alias_Reg ptr +#define dh_alias_XMMReg ptr +#define dh_alias_MMXReg ptr +#define dh_ctype_Reg Reg * +#define dh_ctype_XMMReg XMMReg * +#define dh_ctype_MMXReg MMXReg * +#define dh_is_signed_Reg dh_is_signed_ptr +#define dh_is_signed_XMMReg dh_is_signed_ptr +#define dh_is_signed_MMXReg dh_is_signed_ptr + +DEF_HELPER_2(glue(psrlw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psraw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psllw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psrld, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psrad, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pslld, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psrlq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psllq, SUFFIX), void, Reg, Reg) + +#if SHIFT == 1 +DEF_HELPER_2(glue(psrldq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pslldq, SUFFIX), void, Reg, Reg) +#endif + +#define SSE_HELPER_B(name, F)\ + DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg) + +#define SSE_HELPER_W(name, F)\ + DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg) + +#define SSE_HELPER_L(name, F)\ + DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg) + +#define SSE_HELPER_Q(name, F)\ + DEF_HELPER_2(glue(name, SUFFIX), void, Reg, Reg) + +SSE_HELPER_B(paddb, FADD) +SSE_HELPER_W(paddw, FADD) +SSE_HELPER_L(paddl, FADD) +SSE_HELPER_Q(paddq, FADD) + +SSE_HELPER_B(psubb, FSUB) +SSE_HELPER_W(psubw, FSUB) +SSE_HELPER_L(psubl, FSUB) +SSE_HELPER_Q(psubq, FSUB) + +SSE_HELPER_B(paddusb, FADDUB) +SSE_HELPER_B(paddsb, FADDSB) +SSE_HELPER_B(psubusb, FSUBUB) +SSE_HELPER_B(psubsb, FSUBSB) + +SSE_HELPER_W(paddusw, FADDUW) +SSE_HELPER_W(paddsw, FADDSW) +SSE_HELPER_W(psubusw, FSUBUW) +SSE_HELPER_W(psubsw, FSUBSW) + +SSE_HELPER_B(pminub, FMINUB) +SSE_HELPER_B(pmaxub, FMAXUB) + +SSE_HELPER_W(pminsw, FMINSW) +SSE_HELPER_W(pmaxsw, FMAXSW) + +SSE_HELPER_Q(pand, FAND) +SSE_HELPER_Q(pandn, FANDN) +SSE_HELPER_Q(por, FOR) +SSE_HELPER_Q(pxor, FXOR) + +SSE_HELPER_B(pcmpgtb, FCMPGTB) +SSE_HELPER_W(pcmpgtw, FCMPGTW) +SSE_HELPER_L(pcmpgtl, FCMPGTL) + +SSE_HELPER_B(pcmpeqb, FCMPEQ) +SSE_HELPER_W(pcmpeqw, FCMPEQ) +SSE_HELPER_L(pcmpeql, FCMPEQ) + +SSE_HELPER_W(pmullw, FMULLW) +#if SHIFT == 0 +SSE_HELPER_W(pmulhrw, FMULHRW) +#endif +SSE_HELPER_W(pmulhuw, FMULHUW) +SSE_HELPER_W(pmulhw, FMULHW) + +SSE_HELPER_B(pavgb, FAVG) +SSE_HELPER_W(pavgw, FAVG) + +DEF_HELPER_2(glue(pmuludq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmaddwd, SUFFIX), void, Reg, Reg) + +DEF_HELPER_2(glue(psadbw, SUFFIX), void, Reg, Reg) +DEF_HELPER_3(glue(maskmov, SUFFIX), void, Reg, Reg, tl) +DEF_HELPER_2(glue(movl_mm_T0, SUFFIX), void, Reg, i32) +#ifdef TARGET_X86_64 +DEF_HELPER_2(glue(movq_mm_T0, SUFFIX), void, Reg, i64) +#endif + +#if SHIFT == 0 +DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int) +#else +DEF_HELPER_3(shufps, void, Reg, 
Reg, int) +DEF_HELPER_3(shufpd, void, Reg, Reg, int) +DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int) +DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int) +DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int) +#endif + +#if SHIFT == 1 +/* FPU ops */ +/* XXX: not accurate */ + +#define SSE_HELPER_S(name, F)\ + DEF_HELPER_2(name ## ps , void, Reg, Reg) \ + DEF_HELPER_2(name ## ss , void, Reg, Reg) \ + DEF_HELPER_2(name ## pd , void, Reg, Reg) \ + DEF_HELPER_2(name ## sd , void, Reg, Reg) + +SSE_HELPER_S(add, FPU_ADD) +SSE_HELPER_S(sub, FPU_SUB) +SSE_HELPER_S(mul, FPU_MUL) +SSE_HELPER_S(div, FPU_DIV) +SSE_HELPER_S(min, FPU_MIN) +SSE_HELPER_S(max, FPU_MAX) +SSE_HELPER_S(sqrt, FPU_SQRT) + + +DEF_HELPER_2(cvtps2pd, void, Reg, Reg) +DEF_HELPER_2(cvtpd2ps, void, Reg, Reg) +DEF_HELPER_2(cvtss2sd, void, Reg, Reg) +DEF_HELPER_2(cvtsd2ss, void, Reg, Reg) +DEF_HELPER_2(cvtdq2ps, void, Reg, Reg) +DEF_HELPER_2(cvtdq2pd, void, Reg, Reg) +DEF_HELPER_2(cvtpi2ps, void, XMMReg, MMXReg) +DEF_HELPER_2(cvtpi2pd, void, XMMReg, MMXReg) +DEF_HELPER_2(cvtsi2ss, void, XMMReg, i32) +DEF_HELPER_2(cvtsi2sd, void, XMMReg, i32) + +#ifdef TARGET_X86_64 +DEF_HELPER_2(cvtsq2ss, void, XMMReg, i64) +DEF_HELPER_2(cvtsq2sd, void, XMMReg, i64) +#endif + +DEF_HELPER_2(cvtps2dq, void, XMMReg, XMMReg) +DEF_HELPER_2(cvtpd2dq, void, XMMReg, XMMReg) +DEF_HELPER_2(cvtps2pi, void, MMXReg, XMMReg) +DEF_HELPER_2(cvtpd2pi, void, MMXReg, XMMReg) +DEF_HELPER_1(cvtss2si, s32, XMMReg) +DEF_HELPER_1(cvtsd2si, s32, XMMReg) +#ifdef TARGET_X86_64 +DEF_HELPER_1(cvtss2sq, s64, XMMReg) +DEF_HELPER_1(cvtsd2sq, s64, XMMReg) +#endif + +DEF_HELPER_2(cvttps2dq, void, XMMReg, XMMReg) +DEF_HELPER_2(cvttpd2dq, void, XMMReg, XMMReg) +DEF_HELPER_2(cvttps2pi, void, MMXReg, XMMReg) +DEF_HELPER_2(cvttpd2pi, void, MMXReg, XMMReg) +DEF_HELPER_1(cvttss2si, s32, XMMReg) +DEF_HELPER_1(cvttsd2si, s32, XMMReg) +#ifdef TARGET_X86_64 +DEF_HELPER_1(cvttss2sq, s64, XMMReg) +DEF_HELPER_1(cvttsd2sq, s64, XMMReg) +#endif + +DEF_HELPER_2(rsqrtps, void, XMMReg, XMMReg) +DEF_HELPER_2(rsqrtss, void, XMMReg, XMMReg) +DEF_HELPER_2(rcpps, void, XMMReg, XMMReg) +DEF_HELPER_2(rcpss, void, XMMReg, XMMReg) +DEF_HELPER_2(extrq_r, void, XMMReg, XMMReg) +DEF_HELPER_3(extrq_i, void, XMMReg, int, int) +DEF_HELPER_2(insertq_r, void, XMMReg, XMMReg) +DEF_HELPER_3(insertq_i, void, XMMReg, int, int) +DEF_HELPER_2(haddps, void, XMMReg, XMMReg) +DEF_HELPER_2(haddpd, void, XMMReg, XMMReg) +DEF_HELPER_2(hsubps, void, XMMReg, XMMReg) +DEF_HELPER_2(hsubpd, void, XMMReg, XMMReg) +DEF_HELPER_2(addsubps, void, XMMReg, XMMReg) +DEF_HELPER_2(addsubpd, void, XMMReg, XMMReg) + +#define SSE_HELPER_CMP(name, F)\ + DEF_HELPER_2( name ## ps , void, Reg, Reg) \ + DEF_HELPER_2( name ## ss , void, Reg, Reg) \ + DEF_HELPER_2( name ## pd , void, Reg, Reg) \ + DEF_HELPER_2( name ## sd , void, Reg, Reg) + +SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) +SSE_HELPER_CMP(cmplt, FPU_CMPLT) +SSE_HELPER_CMP(cmple, FPU_CMPLE) +SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) +SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) +SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) +SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) +SSE_HELPER_CMP(cmpord, FPU_CMPORD) + +DEF_HELPER_2(ucomiss, void, Reg, Reg) +DEF_HELPER_2(comiss, void, Reg, Reg) +DEF_HELPER_2(ucomisd, void, Reg, Reg) +DEF_HELPER_2(comisd, void, Reg, Reg) +DEF_HELPER_1(movmskps, i32, Reg) +DEF_HELPER_1(movmskpd, i32, Reg) +#endif + +DEF_HELPER_1(glue(pmovmskb, SUFFIX), i32, Reg) +DEF_HELPER_2(glue(packsswb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(packuswb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(packssdw, 
SUFFIX), void, Reg, Reg) +#define UNPCK_OP(base_name, base) \ + DEF_HELPER_2(glue(punpck ## base_name ## bw, SUFFIX) , void, Reg, Reg) \ + DEF_HELPER_2(glue(punpck ## base_name ## wd, SUFFIX) , void, Reg, Reg) \ + DEF_HELPER_2(glue(punpck ## base_name ## dq, SUFFIX) , void, Reg, Reg) + +UNPCK_OP(l, 0) +UNPCK_OP(h, 1) + +#if SHIFT == 1 +DEF_HELPER_2(glue(punpcklqdq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(punpckhqdq, SUFFIX), void, Reg, Reg) +#endif + +/* 3DNow! float ops */ +#if SHIFT == 0 +DEF_HELPER_2(pi2fd, void, MMXReg, MMXReg) +DEF_HELPER_2(pi2fw, void, MMXReg, MMXReg) +DEF_HELPER_2(pf2id, void, MMXReg, MMXReg) +DEF_HELPER_2(pf2iw, void, MMXReg, MMXReg) +DEF_HELPER_2(pfacc, void, MMXReg, MMXReg) +DEF_HELPER_2(pfadd, void, MMXReg, MMXReg) +DEF_HELPER_2(pfcmpeq, void, MMXReg, MMXReg) +DEF_HELPER_2(pfcmpge, void, MMXReg, MMXReg) +DEF_HELPER_2(pfcmpgt, void, MMXReg, MMXReg) +DEF_HELPER_2(pfmax, void, MMXReg, MMXReg) +DEF_HELPER_2(pfmin, void, MMXReg, MMXReg) +DEF_HELPER_2(pfmul, void, MMXReg, MMXReg) +DEF_HELPER_2(pfnacc, void, MMXReg, MMXReg) +DEF_HELPER_2(pfpnacc, void, MMXReg, MMXReg) +DEF_HELPER_2(pfrcp, void, MMXReg, MMXReg) +DEF_HELPER_2(pfrsqrt, void, MMXReg, MMXReg) +DEF_HELPER_2(pfsub, void, MMXReg, MMXReg) +DEF_HELPER_2(pfsubr, void, MMXReg, MMXReg) +DEF_HELPER_2(pswapd, void, MMXReg, MMXReg) +#endif + +/* SSSE3 op helpers */ +DEF_HELPER_2(glue(phaddw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(phaddd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(phaddsw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(phsubw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(phsubd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(phsubsw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pabsb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pabsw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pabsd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmaddubsw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmulhrsw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pshufb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psignb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psignw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(psignd, SUFFIX), void, Reg, Reg) +DEF_HELPER_3(glue(palignr, SUFFIX), void, Reg, Reg, s32) + +/* SSE4.1 op helpers */ +#if SHIFT == 1 +DEF_HELPER_2(glue(pblendvb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(blendvps, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(blendvpd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(ptest, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovsxbw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovsxbd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovsxbq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovsxwd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovsxwq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovsxdq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovzxbw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovzxbd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovzxbq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovzxwd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovzxwq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmovzxdq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmuldq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pcmpeqq, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(packusdw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pminsb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pminsd, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pminuw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pminud, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmaxsb, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmaxsd, 
SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmaxuw, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmaxud, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(pmulld, SUFFIX), void, Reg, Reg) +DEF_HELPER_2(glue(phminposuw, SUFFIX), void, Reg, Reg) +DEF_HELPER_3(glue(roundps, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(roundpd, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(roundss, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(roundsd, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(blendps, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(blendpd, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(pblendw, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(dpps, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(dppd, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(mpsadbw, SUFFIX), void, Reg, Reg, i32) +#endif + +/* SSE4.2 op helpers */ +#if SHIFT == 1 +DEF_HELPER_2(glue(pcmpgtq, SUFFIX), void, Reg, Reg) +DEF_HELPER_3(glue(pcmpestri, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(pcmpestrm, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(pcmpistri, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(glue(pcmpistrm, SUFFIX), void, Reg, Reg, i32) +DEF_HELPER_3(crc32, tl, i32, tl, i32) +DEF_HELPER_2(popcnt, tl, tl, i32) +#endif + +#undef SHIFT +#undef Reg +#undef SUFFIX + +#undef SSE_HELPER_B +#undef SSE_HELPER_W +#undef SSE_HELPER_L +#undef SSE_HELPER_Q +#undef SSE_HELPER_S +#undef SSE_HELPER_CMP +#undef UNPCK_OP diff --git a/src/recompiler/target-i386/svm.h b/src/recompiler/target-i386/svm.h new file mode 100644 index 00000000..a224aead --- /dev/null +++ b/src/recompiler/target-i386/svm.h @@ -0,0 +1,222 @@ +#ifndef __SVM_H +#define __SVM_H + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define 
SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +/* only included in documentation, maybe wrong */ +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ + +struct __attribute__ ((__packed__)) vmcb_control_area { + uint16_t intercept_cr_read; + uint16_t intercept_cr_write; + uint16_t intercept_dr_read; + uint16_t intercept_dr_write; + uint32_t intercept_exceptions; + uint64_t intercept; + uint8_t reserved_1[44]; + uint64_t iopm_base_pa; + uint64_t msrpm_base_pa; + uint64_t tsc_offset; + uint32_t asid; + uint8_t tlb_ctl; + uint8_t reserved_2[3]; + uint32_t int_ctl; + uint32_t int_vector; + uint32_t int_state; + uint8_t reserved_3[4]; + uint64_t exit_code; + uint64_t exit_info_1; + uint64_t exit_info_2; + uint32_t exit_int_info; + uint32_t exit_int_info_err; + uint64_t nested_ctl; + uint8_t reserved_4[16]; + uint32_t event_inj; + uint32_t event_inj_err; + uint64_t nested_cr3; + uint64_t lbr_ctl; + uint8_t reserved_5[832]; +}; + +struct __attribute__ ((__packed__)) vmcb_seg { + uint16_t selector; + uint16_t attrib; + uint32_t limit; + uint64_t base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + uint8_t reserved_1[43]; + uint8_t cpl; + uint8_t reserved_2[4]; + uint64_t efer; + uint8_t reserved_3[112]; + uint64_t cr4; + uint64_t cr3; + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + 
uint64_t rflags; + uint64_t rip; + uint8_t reserved_4[88]; + uint64_t rsp; + uint8_t reserved_5[24]; + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernel_gs_base; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t reserved_6[32]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t last_excp_from; + uint64_t last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#endif diff --git a/src/recompiler/target-i386/translate.c b/src/recompiler/target-i386/translate.c new file mode 100644 index 00000000..1b82f3f5 --- /dev/null +++ b/src/recompiler/target-i386/translate.c @@ -0,0 +1,8385 @@ +/* + * i386 translation + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#ifndef VBOX +#include <inttypes.h> +#include <signal.h> +#endif /* !VBOX */ + +#include "cpu.h" +#include "exec-all.h" +#include "disas.h" +#include "tcg-op.h" + +#include "helper.h" +#define GEN_HELPER 1 +#include "helper.h" + +#define PREFIX_REPZ 0x01 +#define PREFIX_REPNZ 0x02 +#define PREFIX_LOCK 0x04 +#define PREFIX_DATA 0x08 +#define PREFIX_ADR 0x10 + +#ifdef TARGET_X86_64 +#define X86_64_ONLY(x) x +#define X86_64_DEF(...) __VA_ARGS__ +#define CODE64(s) ((s)->code64) +#define REX_X(s) ((s)->rex_x) +#define REX_B(s) ((s)->rex_b) +# ifdef VBOX +# define IS_LONG_MODE(s) ((s)->lma) +# endif +/* XXX: gcc generates push/pop in some opcodes, so we cannot use them */ +#if 1 +#define BUGGY_64(x) NULL +#endif +#else +#define X86_64_ONLY(x) NULL +#define X86_64_DEF(...) 
+#define CODE64(s) 0 +#define REX_X(s) 0 +#define REX_B(s) 0 +# ifdef VBOX +# define IS_LONG_MODE(s) 0 +# endif +#endif + +//#define MACRO_TEST 1 + +/* global register indexes */ +static TCGv_ptr cpu_env; +static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst, cpu_cc_tmp; +static TCGv_i32 cpu_cc_op; +static TCGv cpu_regs[CPU_NB_REGS]; +/* local temps */ +static TCGv cpu_T[2], cpu_T3; +/* local register indexes (only used inside old micro ops) */ +static TCGv cpu_tmp0, cpu_tmp4; +static TCGv_ptr cpu_ptr0, cpu_ptr1; +static TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32; +static TCGv_i64 cpu_tmp1_i64; +static TCGv cpu_tmp5; + +static uint8_t gen_opc_cc_op[OPC_BUF_SIZE]; + +#include "gen-icount.h" + +#ifdef TARGET_X86_64 +static int x86_64_hregs; +#endif + +#ifdef VBOX + +/* Special/override code readers to hide patched code. */ + +uint8_t ldub_code_raw(target_ulong pc) +{ + uint8_t b; + +# ifdef VBOX_WITH_RAW_MODE + if (!remR3GetOpcode(cpu_single_env, pc, &b)) +# endif + b = ldub_code(pc); + return b; +} +# define ldub_code(a) ldub_code_raw(a) + +uint16_t lduw_code_raw(target_ulong pc) +{ + uint16_t u16; + u16 = (uint16_t)ldub_code_raw(pc); + u16 |= (uint16_t)ldub_code_raw(pc + 1) << 8; + return u16; +} +# define lduw_code(a) lduw_code_raw(a) + + +uint32_t ldl_code_raw(target_ulong pc) +{ + uint32_t u32; + u32 = (uint32_t)ldub_code_raw(pc); + u32 |= (uint32_t)ldub_code_raw(pc + 1) << 8; + u32 |= (uint32_t)ldub_code_raw(pc + 2) << 16; + u32 |= (uint32_t)ldub_code_raw(pc + 3) << 24; + return u32; +} +# define ldl_code(a) ldl_code_raw(a) + +#endif /* VBOX */ + +typedef struct DisasContext { + /* current insn context */ + int override; /* -1 if no override */ + int prefix; + int aflag, dflag; + target_ulong pc; /* pc = eip + cs_base */ + int is_jmp; /* 1 = means jump (stop translation), 2 means CPU + static state change (stop translation) */ + /* current block context */ + target_ulong cs_base; /* base of CS segment */ + int pe; /* protected mode */ + int code32; /* 32 bit code segment */ +#ifdef TARGET_X86_64 + int lma; /* long mode active */ + int code64; /* 64 bit code segment */ + int rex_x, rex_b; +#endif + int ss32; /* 32 bit stack segment */ + int cc_op; /* current CC operation */ + int addseg; /* non zero if either DS/ES/SS have a non zero base */ + int f_st; /* currently unused */ + int vm86; /* vm86 mode */ +#ifdef VBOX + int vme; /* CR4.VME */ + int pvi; /* CR4.PVI */ + int record_call; /* record calls for CSAM or not? 
*/ +#endif + int cpl; + int iopl; + int tf; /* TF cpu flag */ + int singlestep_enabled; /* "hardware" single step enabled */ + int jmp_opt; /* use direct block chaining for direct jumps */ + int mem_index; /* select memory access functions */ + uint64_t flags; /* all execution flags */ + struct TranslationBlock *tb; + int popl_esp_hack; /* for correct popl with esp base handling */ + int rip_offset; /* only used in x86_64, but left for simplicity */ + int cpuid_features; + int cpuid_ext_features; + int cpuid_ext2_features; + int cpuid_ext3_features; +} DisasContext; + +static void gen_eob(DisasContext *s); +static void gen_jmp(DisasContext *s, target_ulong eip); +static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num); + +#ifdef VBOX +static void gen_check_external_event(void); +#endif + +/* i386 arith/logic operations */ +enum { + OP_ADDL, + OP_ORL, + OP_ADCL, + OP_SBBL, + OP_ANDL, + OP_SUBL, + OP_XORL, + OP_CMPL, +}; + +/* i386 shift ops */ +enum { + OP_ROL, + OP_ROR, + OP_RCL, + OP_RCR, + OP_SHL, + OP_SHR, + OP_SHL1, /* undocumented */ + OP_SAR = 7, +}; + +enum { + JCC_O, + JCC_B, + JCC_Z, + JCC_BE, + JCC_S, + JCC_P, + JCC_L, + JCC_LE, +}; + +/* operand size */ +enum { + OT_BYTE = 0, + OT_WORD, + OT_LONG, + OT_QUAD, +}; + +enum { + /* I386 int registers */ + OR_EAX, /* MUST be even numbered */ + OR_ECX, + OR_EDX, + OR_EBX, + OR_ESP, + OR_EBP, + OR_ESI, + OR_EDI, + + OR_TMP0 = 16, /* temporary operand register */ + OR_TMP1, + OR_A0, /* temporary register used when doing address evaluation */ +}; + +static inline void gen_op_movl_T0_0(void) +{ + tcg_gen_movi_tl(cpu_T[0], 0); +} + +static inline void gen_op_movl_T0_im(int32_t val) +{ + tcg_gen_movi_tl(cpu_T[0], val); +} + +static inline void gen_op_movl_T0_imu(uint32_t val) +{ + tcg_gen_movi_tl(cpu_T[0], val); +} + +static inline void gen_op_movl_T1_im(int32_t val) +{ + tcg_gen_movi_tl(cpu_T[1], val); +} + +static inline void gen_op_movl_T1_imu(uint32_t val) +{ + tcg_gen_movi_tl(cpu_T[1], val); +} + +static inline void gen_op_movl_A0_im(uint32_t val) +{ + tcg_gen_movi_tl(cpu_A0, val); +} + +#ifdef TARGET_X86_64 +static inline void gen_op_movq_A0_im(int64_t val) +{ + tcg_gen_movi_tl(cpu_A0, val); +} +#endif + +static inline void gen_movtl_T0_im(target_ulong val) +{ + tcg_gen_movi_tl(cpu_T[0], val); +} + +static inline void gen_movtl_T1_im(target_ulong val) +{ + tcg_gen_movi_tl(cpu_T[1], val); +} + +static inline void gen_op_andl_T0_ffff(void) +{ + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 0xffff); +} + +static inline void gen_op_andl_T0_im(uint32_t val) +{ + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], val); +} + +static inline void gen_op_movl_T0_T1(void) +{ + tcg_gen_mov_tl(cpu_T[0], cpu_T[1]); +} + +static inline void gen_op_andl_A0_ffff(void) +{ + tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffff); +} + +#ifdef TARGET_X86_64 + +#define NB_OP_SIZES 4 + +#else /* !TARGET_X86_64 */ + +#define NB_OP_SIZES 3 + +#endif /* !TARGET_X86_64 */ + +#if defined(HOST_WORDS_BIGENDIAN) +#define REG_B_OFFSET (sizeof(target_ulong) - 1) +#define REG_H_OFFSET (sizeof(target_ulong) - 2) +#define REG_W_OFFSET (sizeof(target_ulong) - 2) +#define REG_L_OFFSET (sizeof(target_ulong) - 4) +#define REG_LH_OFFSET (sizeof(target_ulong) - 8) +#else +#define REG_B_OFFSET 0 +#define REG_H_OFFSET 1 +#define REG_W_OFFSET 0 +#define REG_L_OFFSET 0 +#define REG_LH_OFFSET 4 +#endif + +static inline void gen_op_mov_reg_v(int ot, int reg, TCGv t0) +{ + TCGv tmp; + + switch(ot) { + case OT_BYTE: + tmp = tcg_temp_new(); + tcg_gen_ext8u_tl(tmp, t0); + if (reg < 4 X86_64_DEF( || reg >= 8 || 
x86_64_hregs)) { + tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xff); + tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp); + } else { + tcg_gen_shli_tl(tmp, tmp, 8); + tcg_gen_andi_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], ~0xff00); + tcg_gen_or_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], tmp); + } + tcg_temp_free(tmp); + break; + case OT_WORD: + tmp = tcg_temp_new(); + tcg_gen_ext16u_tl(tmp, t0); + tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff); + tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp); + tcg_temp_free(tmp); + break; + default: /* XXX this shouldn't be reached; abort? */ + case OT_LONG: + /* For x86_64, this sets the higher half of register to zero. + For i386, this is equivalent to a mov. */ + tcg_gen_ext32u_tl(cpu_regs[reg], t0); + break; +#ifdef TARGET_X86_64 + case OT_QUAD: + tcg_gen_mov_tl(cpu_regs[reg], t0); + break; +#endif + } +} + +static inline void gen_op_mov_reg_T0(int ot, int reg) +{ + gen_op_mov_reg_v(ot, reg, cpu_T[0]); +} + +static inline void gen_op_mov_reg_T1(int ot, int reg) +{ + gen_op_mov_reg_v(ot, reg, cpu_T[1]); +} + +static inline void gen_op_mov_reg_A0(int size, int reg) +{ + TCGv tmp; + + switch(size) { + case 0: + tmp = tcg_temp_new(); + tcg_gen_ext16u_tl(tmp, cpu_A0); + tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff); + tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], tmp); + tcg_temp_free(tmp); + break; + default: /* XXX this shouldn't be reached; abort? */ + case 1: + /* For x86_64, this sets the higher half of register to zero. + For i386, this is equivalent to a mov. */ + tcg_gen_ext32u_tl(cpu_regs[reg], cpu_A0); + break; +#ifdef TARGET_X86_64 + case 2: + tcg_gen_mov_tl(cpu_regs[reg], cpu_A0); + break; +#endif + } +} + +static inline void gen_op_mov_v_reg(int ot, TCGv t0, int reg) +{ + switch(ot) { + case OT_BYTE: + if (reg < 4 X86_64_DEF( || reg >= 8 || x86_64_hregs)) { + goto std_case; + } else { + tcg_gen_shri_tl(t0, cpu_regs[reg - 4], 8); + tcg_gen_ext8u_tl(t0, t0); + } + break; + default: + std_case: + tcg_gen_mov_tl(t0, cpu_regs[reg]); + break; + } +} + +static inline void gen_op_mov_TN_reg(int ot, int t_index, int reg) +{ + gen_op_mov_v_reg(ot, cpu_T[t_index], reg); +} + +static inline void gen_op_movl_A0_reg(int reg) +{ + tcg_gen_mov_tl(cpu_A0, cpu_regs[reg]); +} + +static inline void gen_op_addl_A0_im(int32_t val) +{ + tcg_gen_addi_tl(cpu_A0, cpu_A0, val); +#ifdef TARGET_X86_64 + tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff); +#endif +} + +#ifdef TARGET_X86_64 +static inline void gen_op_addq_A0_im(int64_t val) +{ + tcg_gen_addi_tl(cpu_A0, cpu_A0, val); +} +#endif + +static void gen_add_A0_im(DisasContext *s, int val) +{ +#ifdef TARGET_X86_64 + if (CODE64(s)) + gen_op_addq_A0_im(val); + else +#endif + gen_op_addl_A0_im(val); +} + +static inline void gen_op_addl_T0_T1(void) +{ + tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]); +} + +static inline void gen_op_jmp_T0(void) +{ + tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUState, eip)); +} + +static inline void gen_op_add_reg_im(int size, int reg, int32_t val) +{ + switch(size) { + case 0: + tcg_gen_addi_tl(cpu_tmp0, cpu_regs[reg], val); + tcg_gen_ext16u_tl(cpu_tmp0, cpu_tmp0); + tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff); + tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], cpu_tmp0); + break; + case 1: + tcg_gen_addi_tl(cpu_tmp0, cpu_regs[reg], val); + /* For x86_64, this sets the higher half of register to zero. + For i386, this is equivalent to a nop. 
*/
+        tcg_gen_ext32u_tl(cpu_tmp0, cpu_tmp0);
+        tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
+        break;
+#ifdef TARGET_X86_64
+    case 2:
+        tcg_gen_addi_tl(cpu_regs[reg], cpu_regs[reg], val);
+        break;
+#endif
+    }
+}
+
+static inline void gen_op_add_reg_T0(int size, int reg)
+{
+    switch(size) {
+    case 0:
+        tcg_gen_add_tl(cpu_tmp0, cpu_regs[reg], cpu_T[0]);
+        tcg_gen_ext16u_tl(cpu_tmp0, cpu_tmp0);
+        tcg_gen_andi_tl(cpu_regs[reg], cpu_regs[reg], ~0xffff);
+        tcg_gen_or_tl(cpu_regs[reg], cpu_regs[reg], cpu_tmp0);
+        break;
+    case 1:
+        tcg_gen_add_tl(cpu_tmp0, cpu_regs[reg], cpu_T[0]);
+        /* For x86_64, this sets the higher half of register to zero.
+           For i386, this is equivalent to a nop. */
+        tcg_gen_ext32u_tl(cpu_tmp0, cpu_tmp0);
+        tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
+        break;
+#ifdef TARGET_X86_64
+    case 2:
+        tcg_gen_add_tl(cpu_regs[reg], cpu_regs[reg], cpu_T[0]);
+        break;
+#endif
+    }
+}
+
+static inline void gen_op_set_cc_op(int32_t val)
+{
+    tcg_gen_movi_i32(cpu_cc_op, val);
+}
+
+static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
+{
+    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
+    if (shift != 0)
+        tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
+    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
+    /* For x86_64, this sets the higher half of register to zero.
+       For i386, this is equivalent to a nop. */
+    tcg_gen_ext32u_tl(cpu_A0, cpu_A0);
+}
+
+#ifdef VBOX
+DECLINLINE(void) gen_op_seg_check(int reg, bool keepA0)
+{
+    /* It seems segments don't get out of sync - if they in fact do, enable the code below. */
+# ifdef FORCE_SEGMENT_SYNC
+# if 1
+    TCGv t0;
+
+    /* Given the poor quality of the TCG optimizer, it is better to call the helper directly. */
+    t0 = tcg_temp_local_new(TCG_TYPE_TL);
+    tcg_gen_movi_tl(t0, reg);
+    tcg_gen_helper_0_1(helper_sync_seg, t0);
+    tcg_temp_free(t0);
+# else
+    /* Our segments could be outdated, so check the newselector field to see if an update is really needed. */
+    int skip_label;
+    TCGv t0, a0;
+
+    /* For other segments this check is a waste of time; also, TCG cannot cope with this code
+       for data/stack segments, as it expects cpu_T[0] to stay live. */
+    if (reg != R_GS)
+        return;
+
+    if (keepA0)
+    {
+        /* we need to store the old cpu_A0 */
+        a0 = tcg_temp_local_new(TCG_TYPE_TL);
+        tcg_gen_mov_tl(a0, cpu_A0);
+    }
+
+    skip_label = gen_new_label();
+    t0 = tcg_temp_local_new(TCG_TYPE_TL);
+
+    tcg_gen_ld32u_tl(t0, cpu_env, offsetof(CPUState, segs[reg].newselector) + REG_L_OFFSET);
+    tcg_gen_brcondi_i32(TCG_COND_EQ, t0, 0, skip_label);
+    tcg_gen_ld32u_tl(t0, cpu_env, offsetof(CPUState, eflags) + REG_L_OFFSET);
+    tcg_gen_andi_tl(t0, t0, VM_MASK);
+    tcg_gen_brcondi_i32(TCG_COND_NE, t0, 0, skip_label);
+    tcg_gen_movi_tl(t0, reg);
+
+    tcg_gen_helper_0_1(helper_sync_seg, t0);
+
+    tcg_temp_free(t0);
+
+    gen_set_label(skip_label);
+    if (keepA0)
+    {
+        tcg_gen_mov_tl(cpu_A0, a0);
+        tcg_temp_free(a0);
+    }
+# endif /* 0 */
+# endif /* FORCE_SEGMENT_SYNC */
+}
+#endif /* VBOX */
+
+static inline void gen_op_movl_A0_seg(int reg)
+{
+#ifdef VBOX
+    gen_op_seg_check(reg, false);
+#endif
+    tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUState, segs[reg].base) + REG_L_OFFSET);
+}
+
+static inline void gen_op_addl_A0_seg(int reg)
+{
+#ifdef VBOX
+    gen_op_seg_check(reg, true);
+#endif
+    tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, segs[reg].base));
+    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
+#ifdef TARGET_X86_64
+    tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
+#endif
+}
+
+#ifdef TARGET_X86_64
+static inline void gen_op_movq_A0_seg(int reg)
+{
+#ifdef VBOX
+    gen_op_seg_check(reg, false);
+#endif
+    tcg_gen_ld_tl(cpu_A0, cpu_env,
offsetof(CPUState, segs[reg].base)); +} + +static inline void gen_op_addq_A0_seg(int reg) +{ +#ifdef VBOX + gen_op_seg_check(reg, true); +#endif + tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUState, segs[reg].base)); + tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0); +} + +static inline void gen_op_movq_A0_reg(int reg) +{ + tcg_gen_mov_tl(cpu_A0, cpu_regs[reg]); +} + +static inline void gen_op_addq_A0_reg_sN(int shift, int reg) +{ + tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]); + if (shift != 0) + tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift); + tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0); +} +#endif + +static inline void gen_op_lds_T0_A0(int idx) +{ + int mem_index = (idx >> 2) - 1; + switch(idx & 3) { + case 0: + tcg_gen_qemu_ld8s(cpu_T[0], cpu_A0, mem_index); + break; + case 1: + tcg_gen_qemu_ld16s(cpu_T[0], cpu_A0, mem_index); + break; + default: + case 2: + tcg_gen_qemu_ld32s(cpu_T[0], cpu_A0, mem_index); + break; + } +} + +static inline void gen_op_ld_v(int idx, TCGv t0, TCGv a0) +{ + int mem_index = (idx >> 2) - 1; + switch(idx & 3) { + case 0: + tcg_gen_qemu_ld8u(t0, a0, mem_index); + break; + case 1: + tcg_gen_qemu_ld16u(t0, a0, mem_index); + break; + case 2: + tcg_gen_qemu_ld32u(t0, a0, mem_index); + break; + default: + case 3: + /* Should never happen on 32-bit targets. */ +#ifdef TARGET_X86_64 + tcg_gen_qemu_ld64(t0, a0, mem_index); +#endif + break; + } +} + +/* XXX: always use ldu or lds */ +static inline void gen_op_ld_T0_A0(int idx) +{ + gen_op_ld_v(idx, cpu_T[0], cpu_A0); +} + +static inline void gen_op_ldu_T0_A0(int idx) +{ + gen_op_ld_v(idx, cpu_T[0], cpu_A0); +} + +static inline void gen_op_ld_T1_A0(int idx) +{ + gen_op_ld_v(idx, cpu_T[1], cpu_A0); +} + +static inline void gen_op_st_v(int idx, TCGv t0, TCGv a0) +{ + int mem_index = (idx >> 2) - 1; + switch(idx & 3) { + case 0: + tcg_gen_qemu_st8(t0, a0, mem_index); + break; + case 1: + tcg_gen_qemu_st16(t0, a0, mem_index); + break; + case 2: + tcg_gen_qemu_st32(t0, a0, mem_index); + break; + default: + case 3: + /* Should never happen on 32-bit targets. 
*/
+#ifdef TARGET_X86_64
+        tcg_gen_qemu_st64(t0, a0, mem_index);
+#endif
+        break;
+    }
+}
+
+static inline void gen_op_st_T0_A0(int idx)
+{
+    gen_op_st_v(idx, cpu_T[0], cpu_A0);
+}
+
+static inline void gen_op_st_T1_A0(int idx)
+{
+    gen_op_st_v(idx, cpu_T[1], cpu_A0);
+}
+
+#ifdef VBOX
+
+static void gen_check_external_event(void)
+{
+# if 1
+    /** @todo once TCG code generation improves, we may want to switch to
+        the version in the #else branch below. */
+    gen_helper_check_external_event();
+# else
+    int skip_label;
+    TCGv t0;
+
+    skip_label = gen_new_label();
+    t0 = tcg_temp_local_new(TCG_TYPE_TL);
+    /* t0 = cpu_tmp0; */
+
+    tcg_gen_ld32u_tl(t0, cpu_env, offsetof(CPUState, interrupt_request));
+    /* Keep in sync with helper_check_external_event() */
+    tcg_gen_andi_tl(t0, t0,
+                    CPU_INTERRUPT_EXTERNAL_EXIT
+                    | CPU_INTERRUPT_EXTERNAL_TIMER
+                    | CPU_INTERRUPT_EXTERNAL_DMA
+                    | CPU_INTERRUPT_EXTERNAL_HARD);
+    /** @todo predict branch as taken */
+    tcg_gen_brcondi_i32(TCG_COND_EQ, t0, 0, skip_label);
+    tcg_temp_free(t0);
+
+    gen_helper_check_external_event();
+
+    gen_set_label(skip_label);
+# endif
+}
+
+#endif /* VBOX */
+
+static inline void gen_jmp_im(target_ulong pc)
+{
+    tcg_gen_movi_tl(cpu_tmp0, pc);
+    tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUState, eip));
+}
+
+#ifdef VBOX
+DECLINLINE(void) gen_update_eip(target_ulong pc)
+{
+    gen_jmp_im(pc);
+# ifdef VBOX_DUMP_STATE
+    gen_helper_dump_state();
+# endif
+}
+#endif /* VBOX */
+
+static inline void gen_string_movl_A0_ESI(DisasContext *s)
+{
+    int override;
+
+    override = s->override;
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        if (override >= 0) {
+            gen_op_movq_A0_seg(override);
+            gen_op_addq_A0_reg_sN(0, R_ESI);
+        } else {
+            gen_op_movq_A0_reg(R_ESI);
+        }
+    } else
+#endif
+    if (s->aflag) {
+        /* 32 bit address */
+        if (s->addseg && override < 0)
+            override = R_DS;
+        if (override >= 0) {
+            gen_op_movl_A0_seg(override);
+            gen_op_addl_A0_reg_sN(0, R_ESI);
+        } else {
+            gen_op_movl_A0_reg(R_ESI);
+        }
+    } else {
+        /* 16 bit address, always override */
+        if (override < 0)
+            override = R_DS;
+        gen_op_movl_A0_reg(R_ESI);
+        gen_op_andl_A0_ffff();
+        gen_op_addl_A0_seg(override);
+    }
+}
+
+static inline void gen_string_movl_A0_EDI(DisasContext *s)
+{
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_movq_A0_reg(R_EDI);
+    } else
+#endif
+    if (s->aflag) {
+        if (s->addseg) {
+            gen_op_movl_A0_seg(R_ES);
+            gen_op_addl_A0_reg_sN(0, R_EDI);
+        } else {
+            gen_op_movl_A0_reg(R_EDI);
+        }
+    } else {
+        gen_op_movl_A0_reg(R_EDI);
+        gen_op_andl_A0_ffff();
+        gen_op_addl_A0_seg(R_ES);
+    }
+}
+
+static inline void gen_op_movl_T0_Dshift(int ot)
+{
+    tcg_gen_ld32s_tl(cpu_T[0], cpu_env, offsetof(CPUState, df));
+    tcg_gen_shli_tl(cpu_T[0], cpu_T[0], ot);
+}
+
+static void gen_extu(int ot, TCGv reg)
+{
+    switch(ot) {
+    case OT_BYTE:
+        tcg_gen_ext8u_tl(reg, reg);
+        break;
+    case OT_WORD:
+        tcg_gen_ext16u_tl(reg, reg);
+        break;
+    case OT_LONG:
+        tcg_gen_ext32u_tl(reg, reg);
+        break;
+    default:
+        break;
+    }
+}
+
+static void gen_exts(int ot, TCGv reg)
+{
+    switch(ot) {
+    case OT_BYTE:
+        tcg_gen_ext8s_tl(reg, reg);
+        break;
+    case OT_WORD:
+        tcg_gen_ext16s_tl(reg, reg);
+        break;
+    case OT_LONG:
+        tcg_gen_ext32s_tl(reg, reg);
+        break;
+    default:
+        break;
+    }
+}
+
+static inline void gen_op_jnz_ecx(int size, int label1)
+{
+    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[R_ECX]);
+    gen_extu(size + 1, cpu_tmp0);
+    tcg_gen_brcondi_tl(TCG_COND_NE, cpu_tmp0, 0, label1);
+}
+
+static inline void gen_op_jz_ecx(int size, int label1)
+{
+    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[R_ECX]);
+    gen_extu(size + 1, cpu_tmp0);
+
tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, label1); +} + +static void gen_helper_in_func(int ot, TCGv v, TCGv_i32 n) +{ + switch (ot) { + case 0: gen_helper_inb(v, n); break; + case 1: gen_helper_inw(v, n); break; + case 2: gen_helper_inl(v, n); break; + } + +} + +static void gen_helper_out_func(int ot, TCGv_i32 v, TCGv_i32 n) +{ + switch (ot) { + case 0: gen_helper_outb(v, n); break; + case 1: gen_helper_outw(v, n); break; + case 2: gen_helper_outl(v, n); break; + } + +} + +static void gen_check_io(DisasContext *s, int ot, target_ulong cur_eip, + uint32_t svm_flags) +{ + int state_saved; + target_ulong next_eip; + + state_saved = 0; + if (s->pe && (s->cpl > s->iopl || s->vm86)) { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(cur_eip); + state_saved = 1; + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + switch (ot) { + case 0: gen_helper_check_iob(cpu_tmp2_i32); break; + case 1: gen_helper_check_iow(cpu_tmp2_i32); break; + case 2: gen_helper_check_iol(cpu_tmp2_i32); break; + } + } + if(s->flags & HF_SVMI_MASK) { + if (!state_saved) { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(cur_eip); + } + svm_flags |= (1 << (4 + ot)); + next_eip = s->pc - s->cs_base; + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_svm_check_io(cpu_tmp2_i32, tcg_const_i32(svm_flags), + tcg_const_i32(next_eip - cur_eip)); + } +} + +static inline void gen_movs(DisasContext *s, int ot) +{ + gen_string_movl_A0_ESI(s); + gen_op_ld_T0_A0(ot + s->mem_index); + gen_string_movl_A0_EDI(s); + gen_op_st_T0_A0(ot + s->mem_index); + gen_op_movl_T0_Dshift(ot); + gen_op_add_reg_T0(s->aflag, R_ESI); + gen_op_add_reg_T0(s->aflag, R_EDI); +} + +static inline void gen_update_cc_op(DisasContext *s) +{ + if (s->cc_op != CC_OP_DYNAMIC) { + gen_op_set_cc_op(s->cc_op); + s->cc_op = CC_OP_DYNAMIC; + } +} + +static void gen_op_update1_cc(void) +{ + tcg_gen_discard_tl(cpu_cc_src); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); +} + +static void gen_op_update2_cc(void) +{ + tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); +} + +static inline void gen_op_cmpl_T0_T1_cc(void) +{ + tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); + tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]); +} + +static inline void gen_op_testl_T0_T1_cc(void) +{ + tcg_gen_discard_tl(cpu_cc_src); + tcg_gen_and_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]); +} + +static void gen_op_update_neg_cc(void) +{ + tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); +} + +/* compute eflags.C to reg */ +static void gen_compute_eflags_c(TCGv reg) +{ + gen_helper_cc_compute_c(cpu_tmp2_i32, cpu_cc_op); + tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32); +} + +/* compute all eflags to cc_src */ +static void gen_compute_eflags(TCGv reg) +{ + gen_helper_cc_compute_all(cpu_tmp2_i32, cpu_cc_op); + tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32); +} + +static inline void gen_setcc_slow_T0(DisasContext *s, int jcc_op) +{ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + switch(jcc_op) { + case JCC_O: + gen_compute_eflags(cpu_T[0]); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 11); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1); + break; + case JCC_B: + gen_compute_eflags_c(cpu_T[0]); + break; + case JCC_Z: + gen_compute_eflags(cpu_T[0]); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 6); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1); + break; + case JCC_BE: + gen_compute_eflags(cpu_tmp0); + tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 6); + tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1); + break; + 
case JCC_S: + gen_compute_eflags(cpu_T[0]); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 7); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1); + break; + case JCC_P: + gen_compute_eflags(cpu_T[0]); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 2); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1); + break; + case JCC_L: + gen_compute_eflags(cpu_tmp0); + tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */ + tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 7); /* CC_S */ + tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1); + break; + default: + case JCC_LE: + gen_compute_eflags(cpu_tmp0); + tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */ + tcg_gen_shri_tl(cpu_tmp4, cpu_tmp0, 7); /* CC_S */ + tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 6); /* CC_Z */ + tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp4); + tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1); + break; + } +} + +/* return true if setcc_slow is not needed (WARNING: must be kept in + sync with gen_jcc1) */ +static int is_fast_jcc_case(DisasContext *s, int b) +{ + int jcc_op; + jcc_op = (b >> 1) & 7; + switch(s->cc_op) { + /* we optimize the cmp/jcc case */ + case CC_OP_SUBB: + case CC_OP_SUBW: + case CC_OP_SUBL: + case CC_OP_SUBQ: + if (jcc_op == JCC_O || jcc_op == JCC_P) + goto slow_jcc; + break; + + /* some jumps are easy to compute */ + case CC_OP_ADDB: + case CC_OP_ADDW: + case CC_OP_ADDL: + case CC_OP_ADDQ: + + case CC_OP_LOGICB: + case CC_OP_LOGICW: + case CC_OP_LOGICL: + case CC_OP_LOGICQ: + + case CC_OP_INCB: + case CC_OP_INCW: + case CC_OP_INCL: + case CC_OP_INCQ: + + case CC_OP_DECB: + case CC_OP_DECW: + case CC_OP_DECL: + case CC_OP_DECQ: + + case CC_OP_SHLB: + case CC_OP_SHLW: + case CC_OP_SHLL: + case CC_OP_SHLQ: + if (jcc_op != JCC_Z && jcc_op != JCC_S) + goto slow_jcc; + break; + default: + slow_jcc: + return 0; + } + return 1; +} + +/* generate a conditional jump to label 'l1' according to jump opcode + value 'b'. In the fast case, T0 is guaranted not to be used. */ +static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1) +{ + int inv, jcc_op, size, cond; + TCGv t0; + + inv = b & 1; + jcc_op = (b >> 1) & 7; + + switch(cc_op) { + /* we optimize the cmp/jcc case */ + case CC_OP_SUBB: + case CC_OP_SUBW: + case CC_OP_SUBL: + case CC_OP_SUBQ: + + size = cc_op - CC_OP_SUBB; + switch(jcc_op) { + case JCC_Z: + fast_jcc_z: + switch(size) { + case 0: + tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0xff); + t0 = cpu_tmp0; + break; + case 1: + tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0xffff); + t0 = cpu_tmp0; + break; +#ifdef TARGET_X86_64 + case 2: + tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0xffffffff); + t0 = cpu_tmp0; + break; +#endif + default: + t0 = cpu_cc_dst; + break; + } + tcg_gen_brcondi_tl(inv ? TCG_COND_NE : TCG_COND_EQ, t0, 0, l1); + break; + case JCC_S: + fast_jcc_s: + switch(size) { + case 0: + tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x80); + tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, + 0, l1); + break; + case 1: + tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x8000); + tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, + 0, l1); + break; +#ifdef TARGET_X86_64 + case 2: + tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x80000000); + tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, + 0, l1); + break; +#endif + default: + tcg_gen_brcondi_tl(inv ? TCG_COND_GE : TCG_COND_LT, cpu_cc_dst, + 0, l1); + break; + } + break; + + case JCC_B: + cond = inv ? TCG_COND_GEU : TCG_COND_LTU; + goto fast_jcc_b; + case JCC_BE: + cond = inv ? 
TCG_COND_GTU : TCG_COND_LEU; + fast_jcc_b: + tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src); + switch(size) { + case 0: + t0 = cpu_tmp0; + tcg_gen_andi_tl(cpu_tmp4, cpu_tmp4, 0xff); + tcg_gen_andi_tl(t0, cpu_cc_src, 0xff); + break; + case 1: + t0 = cpu_tmp0; + tcg_gen_andi_tl(cpu_tmp4, cpu_tmp4, 0xffff); + tcg_gen_andi_tl(t0, cpu_cc_src, 0xffff); + break; +#ifdef TARGET_X86_64 + case 2: + t0 = cpu_tmp0; + tcg_gen_andi_tl(cpu_tmp4, cpu_tmp4, 0xffffffff); + tcg_gen_andi_tl(t0, cpu_cc_src, 0xffffffff); + break; +#endif + default: + t0 = cpu_cc_src; + break; + } + tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1); + break; + + case JCC_L: + cond = inv ? TCG_COND_GE : TCG_COND_LT; + goto fast_jcc_l; + case JCC_LE: + cond = inv ? TCG_COND_GT : TCG_COND_LE; + fast_jcc_l: + tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src); + switch(size) { + case 0: + t0 = cpu_tmp0; + tcg_gen_ext8s_tl(cpu_tmp4, cpu_tmp4); + tcg_gen_ext8s_tl(t0, cpu_cc_src); + break; + case 1: + t0 = cpu_tmp0; + tcg_gen_ext16s_tl(cpu_tmp4, cpu_tmp4); + tcg_gen_ext16s_tl(t0, cpu_cc_src); + break; +#ifdef TARGET_X86_64 + case 2: + t0 = cpu_tmp0; + tcg_gen_ext32s_tl(cpu_tmp4, cpu_tmp4); + tcg_gen_ext32s_tl(t0, cpu_cc_src); + break; +#endif + default: + t0 = cpu_cc_src; + break; + } + tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1); + break; + + default: + goto slow_jcc; + } + break; + + /* some jumps are easy to compute */ + case CC_OP_ADDB: + case CC_OP_ADDW: + case CC_OP_ADDL: + case CC_OP_ADDQ: + + case CC_OP_ADCB: + case CC_OP_ADCW: + case CC_OP_ADCL: + case CC_OP_ADCQ: + + case CC_OP_SBBB: + case CC_OP_SBBW: + case CC_OP_SBBL: + case CC_OP_SBBQ: + + case CC_OP_LOGICB: + case CC_OP_LOGICW: + case CC_OP_LOGICL: + case CC_OP_LOGICQ: + + case CC_OP_INCB: + case CC_OP_INCW: + case CC_OP_INCL: + case CC_OP_INCQ: + + case CC_OP_DECB: + case CC_OP_DECW: + case CC_OP_DECL: + case CC_OP_DECQ: + + case CC_OP_SHLB: + case CC_OP_SHLW: + case CC_OP_SHLL: + case CC_OP_SHLQ: + + case CC_OP_SARB: + case CC_OP_SARW: + case CC_OP_SARL: + case CC_OP_SARQ: + switch(jcc_op) { + case JCC_Z: + size = (cc_op - CC_OP_ADDB) & 3; + goto fast_jcc_z; + case JCC_S: + size = (cc_op - CC_OP_ADDB) & 3; + goto fast_jcc_s; + default: + goto slow_jcc; + } + break; + default: + slow_jcc: + gen_setcc_slow_T0(s, jcc_op); + tcg_gen_brcondi_tl(inv ? 
TCG_COND_EQ : TCG_COND_NE, + cpu_T[0], 0, l1); + break; + } +} + +/* XXX: does not work with gdbstub "ice" single step - not a + serious problem */ +static int gen_jz_ecx_string(DisasContext *s, target_ulong next_eip) +{ + int l1, l2; + + l1 = gen_new_label(); + l2 = gen_new_label(); + gen_op_jnz_ecx(s->aflag, l1); + gen_set_label(l2); + gen_jmp_tb(s, next_eip, 1); + gen_set_label(l1); + return l2; +} + +static inline void gen_stos(DisasContext *s, int ot) +{ + gen_op_mov_TN_reg(OT_LONG, 0, R_EAX); + gen_string_movl_A0_EDI(s); + gen_op_st_T0_A0(ot + s->mem_index); + gen_op_movl_T0_Dshift(ot); + gen_op_add_reg_T0(s->aflag, R_EDI); +} + +static inline void gen_lods(DisasContext *s, int ot) +{ + gen_string_movl_A0_ESI(s); + gen_op_ld_T0_A0(ot + s->mem_index); + gen_op_mov_reg_T0(ot, R_EAX); + gen_op_movl_T0_Dshift(ot); + gen_op_add_reg_T0(s->aflag, R_ESI); +} + +static inline void gen_scas(DisasContext *s, int ot) +{ + gen_op_mov_TN_reg(OT_LONG, 0, R_EAX); + gen_string_movl_A0_EDI(s); + gen_op_ld_T1_A0(ot + s->mem_index); + gen_op_cmpl_T0_T1_cc(); + gen_op_movl_T0_Dshift(ot); + gen_op_add_reg_T0(s->aflag, R_EDI); +} + +static inline void gen_cmps(DisasContext *s, int ot) +{ + gen_string_movl_A0_ESI(s); + gen_op_ld_T0_A0(ot + s->mem_index); + gen_string_movl_A0_EDI(s); + gen_op_ld_T1_A0(ot + s->mem_index); + gen_op_cmpl_T0_T1_cc(); + gen_op_movl_T0_Dshift(ot); + gen_op_add_reg_T0(s->aflag, R_ESI); + gen_op_add_reg_T0(s->aflag, R_EDI); +} + +static inline void gen_ins(DisasContext *s, int ot) +{ + if (use_icount) + gen_io_start(); + gen_string_movl_A0_EDI(s); + /* Note: we must do this dummy write first to be restartable in + case of page fault. */ + gen_op_movl_T0_0(); + gen_op_st_T0_A0(ot + s->mem_index); + gen_op_mov_TN_reg(OT_WORD, 1, R_EDX); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[1]); + tcg_gen_andi_i32(cpu_tmp2_i32, cpu_tmp2_i32, 0xffff); + gen_helper_in_func(ot, cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(ot + s->mem_index); + gen_op_movl_T0_Dshift(ot); + gen_op_add_reg_T0(s->aflag, R_EDI); + if (use_icount) + gen_io_end(); +} + +static inline void gen_outs(DisasContext *s, int ot) +{ + if (use_icount) + gen_io_start(); + gen_string_movl_A0_ESI(s); + gen_op_ld_T0_A0(ot + s->mem_index); + + gen_op_mov_TN_reg(OT_WORD, 1, R_EDX); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[1]); + tcg_gen_andi_i32(cpu_tmp2_i32, cpu_tmp2_i32, 0xffff); + tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[0]); + gen_helper_out_func(ot, cpu_tmp2_i32, cpu_tmp3_i32); + + gen_op_movl_T0_Dshift(ot); + gen_op_add_reg_T0(s->aflag, R_ESI); + if (use_icount) + gen_io_end(); +} + +/* same method as Valgrind : we generate jumps to current or next + instruction */ +#define GEN_REPZ(op) \ +static inline void gen_repz_ ## op(DisasContext *s, int ot, \ + target_ulong cur_eip, target_ulong next_eip) \ +{ \ + int l2;\ + gen_update_cc_op(s); \ + l2 = gen_jz_ecx_string(s, next_eip); \ + gen_ ## op(s, ot); \ + gen_op_add_reg_im(s->aflag, R_ECX, -1); \ + /* a loop would cause two single step exceptions if ECX = 1 \ + before rep string_insn */ \ + if (!s->jmp_opt) \ + gen_op_jz_ecx(s->aflag, l2); \ + gen_jmp(s, cur_eip); \ +} + +#define GEN_REPZ2(op) \ +static inline void gen_repz_ ## op(DisasContext *s, int ot, \ + target_ulong cur_eip, \ + target_ulong next_eip, \ + int nz) \ +{ \ + int l2;\ + gen_update_cc_op(s); \ + l2 = gen_jz_ecx_string(s, next_eip); \ + gen_ ## op(s, ot); \ + gen_op_add_reg_im(s->aflag, R_ECX, -1); \ + gen_op_set_cc_op(CC_OP_SUBB + ot); \ + gen_jcc1(s, CC_OP_SUBB + ot, (JCC_Z << 1) | (nz ^ 1), l2); \ + if 
(!s->jmp_opt) \ + gen_op_jz_ecx(s->aflag, l2); \ + gen_jmp(s, cur_eip); \ +} + +GEN_REPZ(movs) +GEN_REPZ(stos) +GEN_REPZ(lods) +GEN_REPZ(ins) +GEN_REPZ(outs) +GEN_REPZ2(scas) +GEN_REPZ2(cmps) + +static void gen_helper_fp_arith_ST0_FT0(int op) +{ + switch (op) { + case 0: gen_helper_fadd_ST0_FT0(); break; + case 1: gen_helper_fmul_ST0_FT0(); break; + case 2: gen_helper_fcom_ST0_FT0(); break; + case 3: gen_helper_fcom_ST0_FT0(); break; + case 4: gen_helper_fsub_ST0_FT0(); break; + case 5: gen_helper_fsubr_ST0_FT0(); break; + case 6: gen_helper_fdiv_ST0_FT0(); break; + case 7: gen_helper_fdivr_ST0_FT0(); break; + } +} + +/* NOTE the exception in "r" op ordering */ +static void gen_helper_fp_arith_STN_ST0(int op, int opreg) +{ + TCGv_i32 tmp = tcg_const_i32(opreg); + switch (op) { + case 0: gen_helper_fadd_STN_ST0(tmp); break; + case 1: gen_helper_fmul_STN_ST0(tmp); break; + case 4: gen_helper_fsubr_STN_ST0(tmp); break; + case 5: gen_helper_fsub_STN_ST0(tmp); break; + case 6: gen_helper_fdivr_STN_ST0(tmp); break; + case 7: gen_helper_fdiv_STN_ST0(tmp); break; + } +} + +/* if d == OR_TMP0, it means memory operand (address in A0) */ +static void gen_op(DisasContext *s1, int op, int ot, int d) +{ + if (d != OR_TMP0) { + gen_op_mov_TN_reg(ot, 0, d); + } else { + gen_op_ld_T0_A0(ot + s1->mem_index); + } + switch(op) { + case OP_ADCL: + if (s1->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s1->cc_op); + gen_compute_eflags_c(cpu_tmp4); + tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_tmp4); + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4); + tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2); + tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot); + s1->cc_op = CC_OP_DYNAMIC; + break; + case OP_SBBL: + if (s1->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s1->cc_op); + gen_compute_eflags_c(cpu_tmp4); + tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_tmp4); + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4); + tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2); + tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot); + s1->cc_op = CC_OP_DYNAMIC; + break; + case OP_ADDL: + gen_op_addl_T0_T1(); + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + gen_op_update2_cc(); + s1->cc_op = CC_OP_ADDB + ot; + break; + case OP_SUBL: + tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + gen_op_update2_cc(); + s1->cc_op = CC_OP_SUBB + ot; + break; + default: + case OP_ANDL: + tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + gen_op_update1_cc(); + s1->cc_op = CC_OP_LOGICB + ot; + break; + case OP_ORL: + tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + gen_op_update1_cc(); + s1->cc_op = CC_OP_LOGICB + ot; + break; + case OP_XORL: + tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + gen_op_update1_cc(); + 
s1->cc_op = CC_OP_LOGICB + ot; + break; + case OP_CMPL: + gen_op_cmpl_T0_T1_cc(); + s1->cc_op = CC_OP_SUBB + ot; + break; + } +} + +/* if d == OR_TMP0, it means memory operand (address in A0) */ +static void gen_inc(DisasContext *s1, int ot, int d, int c) +{ + if (d != OR_TMP0) + gen_op_mov_TN_reg(ot, 0, d); + else + gen_op_ld_T0_A0(ot + s1->mem_index); + if (s1->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s1->cc_op); + if (c > 0) { + tcg_gen_addi_tl(cpu_T[0], cpu_T[0], 1); + s1->cc_op = CC_OP_INCB + ot; + } else { + tcg_gen_addi_tl(cpu_T[0], cpu_T[0], -1); + s1->cc_op = CC_OP_DECB + ot; + } + if (d != OR_TMP0) + gen_op_mov_reg_T0(ot, d); + else + gen_op_st_T0_A0(ot + s1->mem_index); + gen_compute_eflags_c(cpu_cc_src); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); +} + +static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, + int is_right, int is_arith) +{ + target_ulong mask; + int shift_label; + TCGv t0, t1; + + if (ot == OT_QUAD) + mask = 0x3f; + else + mask = 0x1f; + + /* load */ + if (op1 == OR_TMP0) + gen_op_ld_T0_A0(ot + s->mem_index); + else + gen_op_mov_TN_reg(ot, 0, op1); + + tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask); + + tcg_gen_addi_tl(cpu_tmp5, cpu_T[1], -1); + + if (is_right) { + if (is_arith) { + gen_exts(ot, cpu_T[0]); + tcg_gen_sar_tl(cpu_T3, cpu_T[0], cpu_tmp5); + tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + } else { + gen_extu(ot, cpu_T[0]); + tcg_gen_shr_tl(cpu_T3, cpu_T[0], cpu_tmp5); + tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + } + } else { + tcg_gen_shl_tl(cpu_T3, cpu_T[0], cpu_tmp5); + tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + } + + /* store */ + if (op1 == OR_TMP0) + gen_op_st_T0_A0(ot + s->mem_index); + else + gen_op_mov_reg_T0(ot, op1); + + /* update eflags if non zero shift */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + + /* XXX: inefficient */ + t0 = tcg_temp_local_new(); + t1 = tcg_temp_local_new(); + + tcg_gen_mov_tl(t0, cpu_T[0]); + tcg_gen_mov_tl(t1, cpu_T3); + + shift_label = gen_new_label(); + tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_T[1], 0, shift_label); + + tcg_gen_mov_tl(cpu_cc_src, t1); + tcg_gen_mov_tl(cpu_cc_dst, t0); + if (is_right) + tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot); + else + tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot); + + gen_set_label(shift_label); + s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */ + + tcg_temp_free(t0); + tcg_temp_free(t1); +} + +static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2, + int is_right, int is_arith) +{ + int mask; + + if (ot == OT_QUAD) + mask = 0x3f; + else + mask = 0x1f; + + /* load */ + if (op1 == OR_TMP0) + gen_op_ld_T0_A0(ot + s->mem_index); + else + gen_op_mov_TN_reg(ot, 0, op1); + + op2 &= mask; + if (op2 != 0) { + if (is_right) { + if (is_arith) { + gen_exts(ot, cpu_T[0]); + tcg_gen_sari_tl(cpu_tmp4, cpu_T[0], op2 - 1); + tcg_gen_sari_tl(cpu_T[0], cpu_T[0], op2); + } else { + gen_extu(ot, cpu_T[0]); + tcg_gen_shri_tl(cpu_tmp4, cpu_T[0], op2 - 1); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], op2); + } + } else { + tcg_gen_shli_tl(cpu_tmp4, cpu_T[0], op2 - 1); + tcg_gen_shli_tl(cpu_T[0], cpu_T[0], op2); + } + } + + /* store */ + if (op1 == OR_TMP0) + gen_op_st_T0_A0(ot + s->mem_index); + else + gen_op_mov_reg_T0(ot, op1); + + /* update eflags if non zero shift */ + if (op2 != 0) { + tcg_gen_mov_tl(cpu_cc_src, cpu_tmp4); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + if (is_right) + s->cc_op = CC_OP_SARB + ot; + else + s->cc_op = CC_OP_SHLB + ot; + } +} + +static inline void tcg_gen_lshift(TCGv ret, TCGv arg1, target_long arg2) +{ + if (arg2 >= 0) + 
tcg_gen_shli_tl(ret, arg1, arg2); + else + tcg_gen_shri_tl(ret, arg1, -arg2); +} + +static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, + int is_right) +{ + target_ulong mask; + int label1, label2, data_bits; + TCGv t0, t1, t2, a0; + + /* XXX: inefficient, but we must use local temps */ + t0 = tcg_temp_local_new(); + t1 = tcg_temp_local_new(); + t2 = tcg_temp_local_new(); + a0 = tcg_temp_local_new(); + + if (ot == OT_QUAD) + mask = 0x3f; + else + mask = 0x1f; + + /* load */ + if (op1 == OR_TMP0) { + tcg_gen_mov_tl(a0, cpu_A0); + gen_op_ld_v(ot + s->mem_index, t0, a0); + } else { + gen_op_mov_v_reg(ot, t0, op1); + } + + tcg_gen_mov_tl(t1, cpu_T[1]); + + tcg_gen_andi_tl(t1, t1, mask); + + /* Must test zero case to avoid using undefined behaviour in TCG + shifts. */ + label1 = gen_new_label(); + tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label1); + + if (ot <= OT_WORD) + tcg_gen_andi_tl(cpu_tmp0, t1, (1 << (3 + ot)) - 1); + else + tcg_gen_mov_tl(cpu_tmp0, t1); + + gen_extu(ot, t0); + tcg_gen_mov_tl(t2, t0); + + data_bits = 8 << ot; + /* XXX: rely on behaviour of shifts when operand 2 overflows (XXX: + fix TCG definition) */ + if (is_right) { + tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp0); + tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0); + tcg_gen_shl_tl(t0, t0, cpu_tmp0); + } else { + tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp0); + tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0); + tcg_gen_shr_tl(t0, t0, cpu_tmp0); + } + tcg_gen_or_tl(t0, t0, cpu_tmp4); + + gen_set_label(label1); + /* store */ + if (op1 == OR_TMP0) { + gen_op_st_v(ot + s->mem_index, t0, a0); + } else { + gen_op_mov_reg_v(ot, op1, t0); + } + + /* update eflags */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + + label2 = gen_new_label(); + tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label2); + + gen_compute_eflags(cpu_cc_src); + tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C)); + tcg_gen_xor_tl(cpu_tmp0, t2, t0); + tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1)); + tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O); + tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0); + if (is_right) { + tcg_gen_shri_tl(t0, t0, data_bits - 1); + } + tcg_gen_andi_tl(t0, t0, CC_C); + tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0); + + tcg_gen_discard_tl(cpu_cc_dst); + tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS); + + gen_set_label(label2); + s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */ + + tcg_temp_free(t0); + tcg_temp_free(t1); + tcg_temp_free(t2); + tcg_temp_free(a0); +} + +static void gen_rot_rm_im(DisasContext *s, int ot, int op1, int op2, + int is_right) +{ + int mask; + int data_bits; + TCGv t0, t1, a0; + + /* XXX: inefficient, but we must use local temps */ + t0 = tcg_temp_local_new(); + t1 = tcg_temp_local_new(); + a0 = tcg_temp_local_new(); + + if (ot == OT_QUAD) + mask = 0x3f; + else + mask = 0x1f; + + /* load */ + if (op1 == OR_TMP0) { + tcg_gen_mov_tl(a0, cpu_A0); + gen_op_ld_v(ot + s->mem_index, t0, a0); + } else { + gen_op_mov_v_reg(ot, t0, op1); + } + + gen_extu(ot, t0); + tcg_gen_mov_tl(t1, t0); + + op2 &= mask; + data_bits = 8 << ot; + if (op2 != 0) { + int shift = op2 & ((1 << (3 + ot)) - 1); + if (is_right) { + tcg_gen_shri_tl(cpu_tmp4, t0, shift); + tcg_gen_shli_tl(t0, t0, data_bits - shift); + } + else { + tcg_gen_shli_tl(cpu_tmp4, t0, shift); + tcg_gen_shri_tl(t0, t0, data_bits - shift); + } + tcg_gen_or_tl(t0, t0, cpu_tmp4); + } + + /* store */ + if (op1 == OR_TMP0) { + gen_op_st_v(ot + s->mem_index, t0, a0); + } else { + gen_op_mov_reg_v(ot, op1, t0); + } + + if (op2 != 0) { + /* update eflags */ + if 
(s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + + gen_compute_eflags(cpu_cc_src); + tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C)); + tcg_gen_xor_tl(cpu_tmp0, t1, t0); + tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1)); + tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O); + tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0); + if (is_right) { + tcg_gen_shri_tl(t0, t0, data_bits - 1); + } + tcg_gen_andi_tl(t0, t0, CC_C); + tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0); + + tcg_gen_discard_tl(cpu_cc_dst); + tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS); + s->cc_op = CC_OP_EFLAGS; + } + + tcg_temp_free(t0); + tcg_temp_free(t1); + tcg_temp_free(a0); +} + +/* XXX: add faster immediate = 1 case */ +static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, + int is_right) +{ + int label1; + + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + + /* load */ + if (op1 == OR_TMP0) + gen_op_ld_T0_A0(ot + s->mem_index); + else + gen_op_mov_TN_reg(ot, 0, op1); + + if (is_right) { + switch (ot) { + case 0: gen_helper_rcrb(cpu_T[0], cpu_T[0], cpu_T[1]); break; + case 1: gen_helper_rcrw(cpu_T[0], cpu_T[0], cpu_T[1]); break; + case 2: gen_helper_rcrl(cpu_T[0], cpu_T[0], cpu_T[1]); break; +#ifdef TARGET_X86_64 + case 3: gen_helper_rcrq(cpu_T[0], cpu_T[0], cpu_T[1]); break; +#endif + } + } else { + switch (ot) { + case 0: gen_helper_rclb(cpu_T[0], cpu_T[0], cpu_T[1]); break; + case 1: gen_helper_rclw(cpu_T[0], cpu_T[0], cpu_T[1]); break; + case 2: gen_helper_rcll(cpu_T[0], cpu_T[0], cpu_T[1]); break; +#ifdef TARGET_X86_64 + case 3: gen_helper_rclq(cpu_T[0], cpu_T[0], cpu_T[1]); break; +#endif + } + } + /* store */ + if (op1 == OR_TMP0) + gen_op_st_T0_A0(ot + s->mem_index); + else + gen_op_mov_reg_T0(ot, op1); + + /* update eflags */ + label1 = gen_new_label(); + tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_cc_tmp, -1, label1); + + tcg_gen_mov_tl(cpu_cc_src, cpu_cc_tmp); + tcg_gen_discard_tl(cpu_cc_dst); + tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS); + + gen_set_label(label1); + s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */ +} + +/* XXX: add faster immediate case */ +static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, + int is_right) +{ + int label1, label2, data_bits; + target_ulong mask; + TCGv t0, t1, t2, a0; + + t0 = tcg_temp_local_new(); + t1 = tcg_temp_local_new(); + t2 = tcg_temp_local_new(); + a0 = tcg_temp_local_new(); + + if (ot == OT_QUAD) + mask = 0x3f; + else + mask = 0x1f; + + /* load */ + if (op1 == OR_TMP0) { + tcg_gen_mov_tl(a0, cpu_A0); + gen_op_ld_v(ot + s->mem_index, t0, a0); + } else { + gen_op_mov_v_reg(ot, t0, op1); + } + + tcg_gen_andi_tl(cpu_T3, cpu_T3, mask); + + tcg_gen_mov_tl(t1, cpu_T[1]); + tcg_gen_mov_tl(t2, cpu_T3); + + /* Must test zero case to avoid using undefined behaviour in TCG + shifts. 
*/ + label1 = gen_new_label(); + tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1); + + tcg_gen_addi_tl(cpu_tmp5, t2, -1); + if (ot == OT_WORD) { + /* Note: we implement the Intel behaviour for shift count > 16 */ + if (is_right) { + tcg_gen_andi_tl(t0, t0, 0xffff); + tcg_gen_shli_tl(cpu_tmp0, t1, 16); + tcg_gen_or_tl(t0, t0, cpu_tmp0); + tcg_gen_ext32u_tl(t0, t0); + + tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5); + + /* only needed if count > 16, but a test would complicate */ + tcg_gen_subfi_tl(cpu_tmp5, 32, t2); + tcg_gen_shl_tl(cpu_tmp0, t0, cpu_tmp5); + + tcg_gen_shr_tl(t0, t0, t2); + + tcg_gen_or_tl(t0, t0, cpu_tmp0); + } else { + /* XXX: not optimal */ + tcg_gen_andi_tl(t0, t0, 0xffff); + tcg_gen_shli_tl(t1, t1, 16); + tcg_gen_or_tl(t1, t1, t0); + tcg_gen_ext32u_tl(t1, t1); + + tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5); + tcg_gen_subfi_tl(cpu_tmp0, 32, cpu_tmp5); + tcg_gen_shr_tl(cpu_tmp5, t1, cpu_tmp0); + tcg_gen_or_tl(cpu_tmp4, cpu_tmp4, cpu_tmp5); + + tcg_gen_shl_tl(t0, t0, t2); + tcg_gen_subfi_tl(cpu_tmp5, 32, t2); + tcg_gen_shr_tl(t1, t1, cpu_tmp5); + tcg_gen_or_tl(t0, t0, t1); + } + } else { + data_bits = 8 << ot; + if (is_right) { + if (ot == OT_LONG) + tcg_gen_ext32u_tl(t0, t0); + + tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5); + + tcg_gen_shr_tl(t0, t0, t2); + tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2); + tcg_gen_shl_tl(t1, t1, cpu_tmp5); + tcg_gen_or_tl(t0, t0, t1); + + } else { + if (ot == OT_LONG) + tcg_gen_ext32u_tl(t1, t1); + + tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5); + + tcg_gen_shl_tl(t0, t0, t2); + tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2); + tcg_gen_shr_tl(t1, t1, cpu_tmp5); + tcg_gen_or_tl(t0, t0, t1); + } + } + tcg_gen_mov_tl(t1, cpu_tmp4); + + gen_set_label(label1); + /* store */ + if (op1 == OR_TMP0) { + gen_op_st_v(ot + s->mem_index, t0, a0); + } else { + gen_op_mov_reg_v(ot, op1, t0); + } + + /* update eflags */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + + label2 = gen_new_label(); + tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label2); + + tcg_gen_mov_tl(cpu_cc_src, t1); + tcg_gen_mov_tl(cpu_cc_dst, t0); + if (is_right) { + tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot); + } else { + tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot); + } + gen_set_label(label2); + s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */ + + tcg_temp_free(t0); + tcg_temp_free(t1); + tcg_temp_free(t2); + tcg_temp_free(a0); +} + +static void gen_shift(DisasContext *s1, int op, int ot, int d, int s) +{ + if (s != OR_TMP1) + gen_op_mov_TN_reg(ot, 1, s); + switch(op) { + case OP_ROL: + gen_rot_rm_T1(s1, ot, d, 0); + break; + case OP_ROR: + gen_rot_rm_T1(s1, ot, d, 1); + break; + case OP_SHL: + case OP_SHL1: + gen_shift_rm_T1(s1, ot, d, 0, 0); + break; + case OP_SHR: + gen_shift_rm_T1(s1, ot, d, 1, 0); + break; + case OP_SAR: + gen_shift_rm_T1(s1, ot, d, 1, 1); + break; + case OP_RCL: + gen_rotc_rm_T1(s1, ot, d, 0); + break; + case OP_RCR: + gen_rotc_rm_T1(s1, ot, d, 1); + break; + } +} + +static void gen_shifti(DisasContext *s1, int op, int ot, int d, int c) +{ + switch(op) { + case OP_ROL: + gen_rot_rm_im(s1, ot, d, c, 0); + break; + case OP_ROR: + gen_rot_rm_im(s1, ot, d, c, 1); + break; + case OP_SHL: + case OP_SHL1: + gen_shift_rm_im(s1, ot, d, c, 0, 0); + break; + case OP_SHR: + gen_shift_rm_im(s1, ot, d, c, 1, 0); + break; + case OP_SAR: + gen_shift_rm_im(s1, ot, d, c, 1, 1); + break; + default: + /* currently not optimized */ + gen_op_movl_T1_im(c); + gen_shift(s1, op, ot, d, OR_TMP1); + break; + } +} + +static void gen_lea_modrm(DisasContext *s, int modrm, int *reg_ptr, int 
*offset_ptr) +{ + target_long disp; + int havesib; + int base; + int index; + int scale; + int opreg; + int mod, rm, code, override, must_add_seg; + + override = s->override; + must_add_seg = s->addseg; + if (override >= 0) + must_add_seg = 1; + mod = (modrm >> 6) & 3; + rm = modrm & 7; + + if (s->aflag) { + + havesib = 0; + base = rm; + index = 0; + scale = 0; + + if (base == 4) { + havesib = 1; + code = ldub_code(s->pc++); + scale = (code >> 6) & 3; + index = ((code >> 3) & 7) | REX_X(s); + base = (code & 7); + } + base |= REX_B(s); + + switch (mod) { + case 0: + if ((base & 7) == 5) { + base = -1; + disp = (int32_t)ldl_code(s->pc); + s->pc += 4; + if (CODE64(s) && !havesib) { + disp += s->pc + s->rip_offset; + } + } else { + disp = 0; + } + break; + case 1: + disp = (int8_t)ldub_code(s->pc++); + break; + default: + case 2: +#ifdef VBOX + disp = (int32_t)ldl_code(s->pc); +#else + disp = ldl_code(s->pc); +#endif + s->pc += 4; + break; + } + + if (base >= 0) { + /* for correct popl handling with esp */ + if (base == 4 && s->popl_esp_hack) + disp += s->popl_esp_hack; +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + gen_op_movq_A0_reg(base); + if (disp != 0) { + gen_op_addq_A0_im(disp); + } + } else +#endif + { + gen_op_movl_A0_reg(base); + if (disp != 0) + gen_op_addl_A0_im(disp); + } + } else { +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + gen_op_movq_A0_im(disp); + } else +#endif + { + gen_op_movl_A0_im(disp); + } + } + /* index == 4 means no index */ + if (havesib && (index != 4)) { +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + gen_op_addq_A0_reg_sN(scale, index); + } else +#endif + { + gen_op_addl_A0_reg_sN(scale, index); + } + } + if (must_add_seg) { + if (override < 0) { + if (base == R_EBP || base == R_ESP) + override = R_SS; + else + override = R_DS; + } +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + gen_op_addq_A0_seg(override); + } else +#endif + { + gen_op_addl_A0_seg(override); + } + } + } else { + switch (mod) { + case 0: + if (rm == 6) { + disp = lduw_code(s->pc); + s->pc += 2; + gen_op_movl_A0_im(disp); + rm = 0; /* avoid SS override */ + goto no_rm; + } else { + disp = 0; + } + break; + case 1: + disp = (int8_t)ldub_code(s->pc++); + break; + default: + case 2: + disp = lduw_code(s->pc); + s->pc += 2; + break; + } + switch(rm) { + case 0: + gen_op_movl_A0_reg(R_EBX); + gen_op_addl_A0_reg_sN(0, R_ESI); + break; + case 1: + gen_op_movl_A0_reg(R_EBX); + gen_op_addl_A0_reg_sN(0, R_EDI); + break; + case 2: + gen_op_movl_A0_reg(R_EBP); + gen_op_addl_A0_reg_sN(0, R_ESI); + break; + case 3: + gen_op_movl_A0_reg(R_EBP); + gen_op_addl_A0_reg_sN(0, R_EDI); + break; + case 4: + gen_op_movl_A0_reg(R_ESI); + break; + case 5: + gen_op_movl_A0_reg(R_EDI); + break; + case 6: + gen_op_movl_A0_reg(R_EBP); + break; + default: + case 7: + gen_op_movl_A0_reg(R_EBX); + break; + } + if (disp != 0) + gen_op_addl_A0_im(disp); + gen_op_andl_A0_ffff(); + no_rm: + if (must_add_seg) { + if (override < 0) { + if (rm == 2 || rm == 3 || rm == 6) + override = R_SS; + else + override = R_DS; + } + gen_op_addl_A0_seg(override); + } + } + + opreg = OR_A0; + disp = 0; + *reg_ptr = opreg; + *offset_ptr = disp; +} + +static void gen_nop_modrm(DisasContext *s, int modrm) +{ + int mod, rm, base, code; + + mod = (modrm >> 6) & 3; + if (mod == 3) + return; + rm = modrm & 7; + + if (s->aflag) { + + base = rm; + + if (base == 4) { + code = ldub_code(s->pc++); + base = (code & 7); + } + + switch (mod) { + case 0: + if (base == 5) { + s->pc += 4; + } + break; + case 1: + s->pc++; + break; + default: + case 2: + s->pc += 4; 
+ break; + } + } else { + switch (mod) { + case 0: + if (rm == 6) { + s->pc += 2; + } + break; + case 1: + s->pc++; + break; + default: + case 2: + s->pc += 2; + break; + } + } +} + +/* used for LEA and MOV AX, mem */ +static void gen_add_A0_ds_seg(DisasContext *s) +{ + int override, must_add_seg; + must_add_seg = s->addseg; + override = R_DS; + if (s->override >= 0) { + override = s->override; + must_add_seg = 1; + } + if (must_add_seg) { +#ifdef TARGET_X86_64 + if (CODE64(s)) { + gen_op_addq_A0_seg(override); + } else +#endif + { + gen_op_addl_A0_seg(override); + } + } +} + +/* generate modrm memory load or store of 'reg'. TMP0 is used if reg == + OR_TMP0 */ +static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store) +{ + int mod, rm, opreg, disp; + + mod = (modrm >> 6) & 3; + rm = (modrm & 7) | REX_B(s); + if (mod == 3) { + if (is_store) { + if (reg != OR_TMP0) + gen_op_mov_TN_reg(ot, 0, reg); + gen_op_mov_reg_T0(ot, rm); + } else { + gen_op_mov_TN_reg(ot, 0, rm); + if (reg != OR_TMP0) + gen_op_mov_reg_T0(ot, reg); + } + } else { + gen_lea_modrm(s, modrm, &opreg, &disp); + if (is_store) { + if (reg != OR_TMP0) + gen_op_mov_TN_reg(ot, 0, reg); + gen_op_st_T0_A0(ot + s->mem_index); + } else { + gen_op_ld_T0_A0(ot + s->mem_index); + if (reg != OR_TMP0) + gen_op_mov_reg_T0(ot, reg); + } + } +} + +static inline uint32_t insn_get(DisasContext *s, int ot) +{ + uint32_t ret; + + switch(ot) { + case OT_BYTE: + ret = ldub_code(s->pc); + s->pc++; + break; + case OT_WORD: + ret = lduw_code(s->pc); + s->pc += 2; + break; + default: + case OT_LONG: + ret = ldl_code(s->pc); + s->pc += 4; + break; + } + return ret; +} + +static inline int insn_const_size(unsigned int ot) +{ + if (ot <= OT_LONG) + return 1 << ot; + else + return 4; +} + +static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip) +{ + TranslationBlock *tb; + target_ulong pc; + + pc = s->cs_base + eip; + tb = s->tb; + /* NOTE: we handle the case where the TB spans two pages here */ + if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) || + (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK)) { +#ifdef VBOX + gen_check_external_event(); +#endif /* VBOX */ + /* jump to same page: we can use a direct jump */ + tcg_gen_goto_tb(tb_num); + gen_jmp_im(eip); + tcg_gen_exit_tb((intptr_t)tb + tb_num); + } else { + /* jump to another page: currently not optimized */ + gen_jmp_im(eip); + gen_eob(s); + } +} + +static inline void gen_jcc(DisasContext *s, int b, + target_ulong val, target_ulong next_eip) +{ + int l1, l2, cc_op; + + cc_op = s->cc_op; + gen_update_cc_op(s); + if (s->jmp_opt) { + l1 = gen_new_label(); + gen_jcc1(s, cc_op, b, l1); + + gen_goto_tb(s, 0, next_eip); + + gen_set_label(l1); + gen_goto_tb(s, 1, val); + s->is_jmp = DISAS_TB_JUMP; + } else { + + l1 = gen_new_label(); + l2 = gen_new_label(); + gen_jcc1(s, cc_op, b, l1); + + gen_jmp_im(next_eip); + tcg_gen_br(l2); + + gen_set_label(l1); + gen_jmp_im(val); + gen_set_label(l2); + gen_eob(s); + } +} + +static void gen_setcc(DisasContext *s, int b) +{ + int inv, jcc_op, l1; + TCGv t0; + + if (is_fast_jcc_case(s, b)) { + /* nominal case: we use a jump */ + /* XXX: make it faster by adding new instructions in TCG */ + t0 = tcg_temp_local_new(); + tcg_gen_movi_tl(t0, 0); + l1 = gen_new_label(); + gen_jcc1(s, s->cc_op, b ^ 1, l1); + tcg_gen_movi_tl(t0, 1); + gen_set_label(l1); + tcg_gen_mov_tl(cpu_T[0], t0); + tcg_temp_free(t0); + } else { + /* slow case: it is more efficient not to generate a jump, + although it is questionnable 
whether this optimization is + worth to */ + inv = b & 1; + jcc_op = (b >> 1) & 7; + gen_setcc_slow_T0(s, jcc_op); + if (inv) { + tcg_gen_xori_tl(cpu_T[0], cpu_T[0], 1); + } + } +} + +static inline void gen_op_movl_T0_seg(int seg_reg) +{ + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,segs[seg_reg].selector)); +} + +static inline void gen_op_movl_seg_T0_vm(int seg_reg) +{ + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 0xffff); + tcg_gen_st32_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,segs[seg_reg].selector)); + tcg_gen_shli_tl(cpu_T[0], cpu_T[0], 4); + tcg_gen_st_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,segs[seg_reg].base)); +} + +/* move T0 to seg_reg and compute if the CPU state may change. Never + call this function with seg_reg == R_CS */ +static void gen_movl_seg_T0(DisasContext *s, int seg_reg, target_ulong cur_eip) +{ + if (s->pe && !s->vm86) { + /* XXX: optimize by finding processor state dynamically */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(cur_eip); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_load_seg(tcg_const_i32(seg_reg), cpu_tmp2_i32); + /* abort translation because the addseg value may change or + because ss32 may change. For R_SS, translation must always + stop as a special handling must be done to disable hardware + interrupts for the next instruction */ + if (seg_reg == R_SS || (s->code32 && seg_reg < R_FS)) + s->is_jmp = DISAS_TB_JUMP; + } else { + gen_op_movl_seg_T0_vm(seg_reg); + if (seg_reg == R_SS) + s->is_jmp = DISAS_TB_JUMP; + } +} + +static inline int svm_is_rep(int prefixes) +{ + return ((prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) ? 8 : 0); +} + +static inline void +gen_svm_check_intercept_param(DisasContext *s, target_ulong pc_start, + uint32_t type, uint64_t param) +{ + /* no SVM activated; fast case */ + if (likely(!(s->flags & HF_SVMI_MASK))) + return; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_svm_check_intercept_param(tcg_const_i32(type), + tcg_const_i64(param)); +} + +static inline void +gen_svm_check_intercept(DisasContext *s, target_ulong pc_start, uint64_t type) +{ + gen_svm_check_intercept_param(s, pc_start, type, 0); +} + +static inline void gen_stack_update(DisasContext *s, int addend) +{ +#ifdef TARGET_X86_64 + if (CODE64(s)) { + gen_op_add_reg_im(2, R_ESP, addend); + } else +#endif + if (s->ss32) { + gen_op_add_reg_im(1, R_ESP, addend); + } else { + gen_op_add_reg_im(0, R_ESP, addend); + } +} + +/* generate a push. It depends on ss32, addseg and dflag */ +static void gen_push_T0(DisasContext *s) +{ +#ifdef TARGET_X86_64 + if (CODE64(s)) { + gen_op_movq_A0_reg(R_ESP); + if (s->dflag) { + gen_op_addq_A0_im(-8); + gen_op_st_T0_A0(OT_QUAD + s->mem_index); + } else { + gen_op_addq_A0_im(-2); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + } + gen_op_mov_reg_A0(2, R_ESP); + } else +#endif + { + gen_op_movl_A0_reg(R_ESP); + if (!s->dflag) + gen_op_addl_A0_im(-2); + else + gen_op_addl_A0_im(-4); + if (s->ss32) { + if (s->addseg) { + tcg_gen_mov_tl(cpu_T[1], cpu_A0); + gen_op_addl_A0_seg(R_SS); + } + } else { + gen_op_andl_A0_ffff(); + tcg_gen_mov_tl(cpu_T[1], cpu_A0); + gen_op_addl_A0_seg(R_SS); + } + gen_op_st_T0_A0(s->dflag + 1 + s->mem_index); + if (s->ss32 && !s->addseg) + gen_op_mov_reg_A0(1, R_ESP); + else + gen_op_mov_reg_T1(s->ss32 + 1, R_ESP); + } +} + +/* generate a push. 
It depends on ss32, addseg and dflag */ +/* slower version for T1, only used for call Ev */ +static void gen_push_T1(DisasContext *s) +{ +#ifdef TARGET_X86_64 + if (CODE64(s)) { + gen_op_movq_A0_reg(R_ESP); + if (s->dflag) { + gen_op_addq_A0_im(-8); + gen_op_st_T1_A0(OT_QUAD + s->mem_index); + } else { + gen_op_addq_A0_im(-2); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + } + gen_op_mov_reg_A0(2, R_ESP); + } else +#endif + { + gen_op_movl_A0_reg(R_ESP); + if (!s->dflag) + gen_op_addl_A0_im(-2); + else + gen_op_addl_A0_im(-4); + if (s->ss32) { + if (s->addseg) { + gen_op_addl_A0_seg(R_SS); + } + } else { + gen_op_andl_A0_ffff(); + gen_op_addl_A0_seg(R_SS); + } + gen_op_st_T1_A0(s->dflag + 1 + s->mem_index); + + if (s->ss32 && !s->addseg) + gen_op_mov_reg_A0(1, R_ESP); + else + gen_stack_update(s, -(2 << s->dflag)); + } +} + +/* two step pop is necessary for precise exceptions */ +static void gen_pop_T0(DisasContext *s) +{ +#ifdef TARGET_X86_64 + if (CODE64(s)) { + gen_op_movq_A0_reg(R_ESP); + gen_op_ld_T0_A0((s->dflag ? OT_QUAD : OT_WORD) + s->mem_index); + } else +#endif + { + gen_op_movl_A0_reg(R_ESP); + if (s->ss32) { + if (s->addseg) + gen_op_addl_A0_seg(R_SS); + } else { + gen_op_andl_A0_ffff(); + gen_op_addl_A0_seg(R_SS); + } + gen_op_ld_T0_A0(s->dflag + 1 + s->mem_index); + } +} + +static void gen_pop_update(DisasContext *s) +{ +#ifdef TARGET_X86_64 + if (CODE64(s) && s->dflag) { + gen_stack_update(s, 8); + } else +#endif + { + gen_stack_update(s, 2 << s->dflag); + } +} + +static void gen_stack_A0(DisasContext *s) +{ + gen_op_movl_A0_reg(R_ESP); + if (!s->ss32) + gen_op_andl_A0_ffff(); + tcg_gen_mov_tl(cpu_T[1], cpu_A0); + if (s->addseg) + gen_op_addl_A0_seg(R_SS); +} + +/* NOTE: wrap around in 16 bit not fully handled */ +static void gen_pusha(DisasContext *s) +{ + int i; + gen_op_movl_A0_reg(R_ESP); + gen_op_addl_A0_im(-(16 << s->dflag)); + if (!s->ss32) + gen_op_andl_A0_ffff(); + tcg_gen_mov_tl(cpu_T[1], cpu_A0); + if (s->addseg) + gen_op_addl_A0_seg(R_SS); + for(i = 0;i < 8; i++) { + gen_op_mov_TN_reg(OT_LONG, 0, 7 - i); + gen_op_st_T0_A0(OT_WORD + s->dflag + s->mem_index); + gen_op_addl_A0_im(2 << s->dflag); + } + gen_op_mov_reg_T1(OT_WORD + s->ss32, R_ESP); +} + +/* NOTE: wrap around in 16 bit not fully handled */ +static void gen_popa(DisasContext *s) +{ + int i; + gen_op_movl_A0_reg(R_ESP); + if (!s->ss32) + gen_op_andl_A0_ffff(); + tcg_gen_mov_tl(cpu_T[1], cpu_A0); + tcg_gen_addi_tl(cpu_T[1], cpu_T[1], 16 << s->dflag); + if (s->addseg) + gen_op_addl_A0_seg(R_SS); + for(i = 0;i < 8; i++) { + /* ESP is not reloaded */ + if (i != 3) { + gen_op_ld_T0_A0(OT_WORD + s->dflag + s->mem_index); + gen_op_mov_reg_T0(OT_WORD + s->dflag, 7 - i); + } + gen_op_addl_A0_im(2 << s->dflag); + } + gen_op_mov_reg_T1(OT_WORD + s->ss32, R_ESP); +} + +static void gen_enter(DisasContext *s, int esp_addend, int level) +{ + int ot, opsize; + + level &= 0x1f; +#ifdef TARGET_X86_64 + if (CODE64(s)) { + ot = s->dflag ? 
OT_QUAD : OT_WORD; + opsize = 1 << ot; + + gen_op_movl_A0_reg(R_ESP); + gen_op_addq_A0_im(-opsize); + tcg_gen_mov_tl(cpu_T[1], cpu_A0); + + /* push bp */ + gen_op_mov_TN_reg(OT_LONG, 0, R_EBP); + gen_op_st_T0_A0(ot + s->mem_index); + if (level) { + /* XXX: must save state */ + gen_helper_enter64_level(tcg_const_i32(level), + tcg_const_i32((ot == OT_QUAD)), + cpu_T[1]); + } + gen_op_mov_reg_T1(ot, R_EBP); + tcg_gen_addi_tl(cpu_T[1], cpu_T[1], -esp_addend + (-opsize * level)); + gen_op_mov_reg_T1(OT_QUAD, R_ESP); + } else +#endif + { + ot = s->dflag + OT_WORD; + opsize = 2 << s->dflag; + + gen_op_movl_A0_reg(R_ESP); + gen_op_addl_A0_im(-opsize); + if (!s->ss32) + gen_op_andl_A0_ffff(); + tcg_gen_mov_tl(cpu_T[1], cpu_A0); + if (s->addseg) + gen_op_addl_A0_seg(R_SS); + /* push bp */ + gen_op_mov_TN_reg(OT_LONG, 0, R_EBP); + gen_op_st_T0_A0(ot + s->mem_index); + if (level) { + /* XXX: must save state */ + gen_helper_enter_level(tcg_const_i32(level), + tcg_const_i32(s->dflag), + cpu_T[1]); + } + gen_op_mov_reg_T1(ot, R_EBP); + tcg_gen_addi_tl(cpu_T[1], cpu_T[1], -esp_addend + (-opsize * level)); + gen_op_mov_reg_T1(OT_WORD + s->ss32, R_ESP); + } +} + +static void gen_exception(DisasContext *s, int trapno, target_ulong cur_eip) +{ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(cur_eip); + gen_helper_raise_exception(tcg_const_i32(trapno)); + s->is_jmp = DISAS_TB_JUMP; +} + +/* an interrupt is different from an exception because of the + privilege checks */ +static void gen_interrupt(DisasContext *s, int intno, + target_ulong cur_eip, target_ulong next_eip) +{ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(cur_eip); + gen_helper_raise_interrupt(tcg_const_i32(intno), + tcg_const_i32(next_eip - cur_eip)); + s->is_jmp = DISAS_TB_JUMP; +} + +static void gen_debug(DisasContext *s, target_ulong cur_eip) +{ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(cur_eip); + gen_helper_debug(); + s->is_jmp = DISAS_TB_JUMP; +} + +/* generate a generic end of block. Trace exception is also generated + if needed */ +static void gen_eob(DisasContext *s) +{ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + if (s->tb->flags & HF_INHIBIT_IRQ_MASK) { + gen_helper_reset_inhibit_irq(); + } + if (s->tb->flags & HF_RF_MASK) { + gen_helper_reset_rf(); + } + if ( s->singlestep_enabled +#ifdef VBOX + && ( !(cpu_single_env->state & CPU_EMULATE_SINGLE_STEP) + || !(s->prefix & (PREFIX_REPNZ | PREFIX_REPZ) )) +#endif + ) { + gen_helper_debug(); + } else if (s->tf) { + gen_helper_single_step(); + } else { + tcg_gen_exit_tb(0); + } + s->is_jmp = DISAS_TB_JUMP; +} + +/* generate a jump to eip. 
No segment change must happen before as a + direct call to the next block may occur */ +static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num) +{ + if (s->jmp_opt) { + gen_update_cc_op(s); + gen_goto_tb(s, tb_num, eip); + s->is_jmp = DISAS_TB_JUMP; + } else { + gen_jmp_im(eip); + gen_eob(s); + } +} + +static void gen_jmp(DisasContext *s, target_ulong eip) +{ + gen_jmp_tb(s, eip, 0); +} + +static inline void gen_ldq_env_A0(int idx, int offset) +{ + int mem_index = (idx >> 2) - 1; + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, mem_index); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, offset); +} + +static inline void gen_stq_env_A0(int idx, int offset) +{ + int mem_index = (idx >> 2) - 1; + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, offset); + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, mem_index); +} + +static inline void gen_ldo_env_A0(int idx, int offset) +{ + int mem_index = (idx >> 2) - 1; + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, mem_index); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0))); + tcg_gen_addi_tl(cpu_tmp0, cpu_A0, 8); + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_tmp0, mem_index); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1))); +} + +static inline void gen_sto_env_A0(int idx, int offset) +{ + int mem_index = (idx >> 2) - 1; + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(0))); + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, mem_index); + tcg_gen_addi_tl(cpu_tmp0, cpu_A0, 8); + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, offset + offsetof(XMMReg, XMM_Q(1))); + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_tmp0, mem_index); +} + +static inline void gen_op_movo(int d_offset, int s_offset) +{ + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, s_offset); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, d_offset); + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, s_offset + 8); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, d_offset + 8); +} + +static inline void gen_op_movq(int d_offset, int s_offset) +{ + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, s_offset); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, d_offset); +} + +static inline void gen_op_movl(int d_offset, int s_offset) +{ + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, s_offset); + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, d_offset); +} + +static inline void gen_op_movq_env_0(int d_offset) +{ + tcg_gen_movi_i64(cpu_tmp1_i64, 0); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, d_offset); +} + +#define SSE_SPECIAL ((void *)1) +#define SSE_DUMMY ((void *)2) + +#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } +#define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \ + gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, } + +static void *sse_op_table1[256][4] = { + /* 3DNow! extensions */ + [0x0e] = { SSE_DUMMY }, /* femms */ + [0x0f] = { SSE_DUMMY }, /* pf... 
*/ + /* pure SSE operations */ + [0x10] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */ + [0x11] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */ + [0x12] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd, movsldup, movddup */ + [0x13] = { SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd */ + [0x14] = { gen_helper_punpckldq_xmm, gen_helper_punpcklqdq_xmm }, + [0x15] = { gen_helper_punpckhdq_xmm, gen_helper_punpckhqdq_xmm }, + [0x16] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd, movshdup */ + [0x17] = { SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd */ + + [0x28] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ + [0x29] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ + [0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */ + [0x2b] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd, movntss, movntsd */ + [0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */ + [0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */ + [0x2e] = { gen_helper_ucomiss, gen_helper_ucomisd }, + [0x2f] = { gen_helper_comiss, gen_helper_comisd }, + [0x50] = { SSE_SPECIAL, SSE_SPECIAL }, /* movmskps, movmskpd */ + [0x51] = SSE_FOP(sqrt), + [0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL }, + [0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL }, + [0x54] = { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */ + [0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd */ + [0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */ + [0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */ + [0x58] = SSE_FOP(add), + [0x59] = SSE_FOP(mul), + [0x5a] = { gen_helper_cvtps2pd, gen_helper_cvtpd2ps, + gen_helper_cvtss2sd, gen_helper_cvtsd2ss }, + [0x5b] = { gen_helper_cvtdq2ps, gen_helper_cvtps2dq, gen_helper_cvttps2dq }, + [0x5c] = SSE_FOP(sub), + [0x5d] = SSE_FOP(min), + [0x5e] = SSE_FOP(div), + [0x5f] = SSE_FOP(max), + + [0xc2] = SSE_FOP(cmpeq), + [0xc6] = { gen_helper_shufps, gen_helper_shufpd }, + + [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */ + [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */ + + /* MMX ops and their SSE extensions */ + [0x60] = MMX_OP2(punpcklbw), + [0x61] = MMX_OP2(punpcklwd), + [0x62] = MMX_OP2(punpckldq), + [0x63] = MMX_OP2(packsswb), + [0x64] = MMX_OP2(pcmpgtb), + [0x65] = MMX_OP2(pcmpgtw), + [0x66] = MMX_OP2(pcmpgtl), + [0x67] = MMX_OP2(packuswb), + [0x68] = MMX_OP2(punpckhbw), + [0x69] = MMX_OP2(punpckhwd), + [0x6a] = MMX_OP2(punpckhdq), + [0x6b] = MMX_OP2(packssdw), + [0x6c] = { NULL, gen_helper_punpcklqdq_xmm }, + [0x6d] = { NULL, gen_helper_punpckhqdq_xmm }, + [0x6e] = { SSE_SPECIAL, SSE_SPECIAL }, /* movd mm, ea */ + [0x6f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, , movqdu */ + [0x70] = { gen_helper_pshufw_mmx, + gen_helper_pshufd_xmm, + gen_helper_pshufhw_xmm, + gen_helper_pshuflw_xmm }, + [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */ + [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */ + [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */ + [0x74] = MMX_OP2(pcmpeqb), + [0x75] = MMX_OP2(pcmpeqw), + [0x76] = MMX_OP2(pcmpeql), + [0x77] = { SSE_DUMMY }, /* emms */ + [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, 
insertq_i */ + [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r }, + [0x7c] = { NULL, gen_helper_haddpd, NULL, gen_helper_haddps }, + [0x7d] = { NULL, gen_helper_hsubpd, NULL, gen_helper_hsubps }, + [0x7e] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, , movq */ + [0x7f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, movdqu */ + [0xc4] = { SSE_SPECIAL, SSE_SPECIAL }, /* pinsrw */ + [0xc5] = { SSE_SPECIAL, SSE_SPECIAL }, /* pextrw */ + [0xd0] = { NULL, gen_helper_addsubpd, NULL, gen_helper_addsubps }, + [0xd1] = MMX_OP2(psrlw), + [0xd2] = MMX_OP2(psrld), + [0xd3] = MMX_OP2(psrlq), + [0xd4] = MMX_OP2(paddq), + [0xd5] = MMX_OP2(pmullw), + [0xd6] = { NULL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, + [0xd7] = { SSE_SPECIAL, SSE_SPECIAL }, /* pmovmskb */ + [0xd8] = MMX_OP2(psubusb), + [0xd9] = MMX_OP2(psubusw), + [0xda] = MMX_OP2(pminub), + [0xdb] = MMX_OP2(pand), + [0xdc] = MMX_OP2(paddusb), + [0xdd] = MMX_OP2(paddusw), + [0xde] = MMX_OP2(pmaxub), + [0xdf] = MMX_OP2(pandn), + [0xe0] = MMX_OP2(pavgb), + [0xe1] = MMX_OP2(psraw), + [0xe2] = MMX_OP2(psrad), + [0xe3] = MMX_OP2(pavgw), + [0xe4] = MMX_OP2(pmulhuw), + [0xe5] = MMX_OP2(pmulhw), + [0xe6] = { NULL, gen_helper_cvttpd2dq, gen_helper_cvtdq2pd, gen_helper_cvtpd2dq }, + [0xe7] = { SSE_SPECIAL , SSE_SPECIAL }, /* movntq, movntq */ + [0xe8] = MMX_OP2(psubsb), + [0xe9] = MMX_OP2(psubsw), + [0xea] = MMX_OP2(pminsw), + [0xeb] = MMX_OP2(por), + [0xec] = MMX_OP2(paddsb), + [0xed] = MMX_OP2(paddsw), + [0xee] = MMX_OP2(pmaxsw), + [0xef] = MMX_OP2(pxor), + [0xf0] = { NULL, NULL, NULL, SSE_SPECIAL }, /* lddqu */ + [0xf1] = MMX_OP2(psllw), + [0xf2] = MMX_OP2(pslld), + [0xf3] = MMX_OP2(psllq), + [0xf4] = MMX_OP2(pmuludq), + [0xf5] = MMX_OP2(pmaddwd), + [0xf6] = MMX_OP2(psadbw), + [0xf7] = MMX_OP2(maskmov), + [0xf8] = MMX_OP2(psubb), + [0xf9] = MMX_OP2(psubw), + [0xfa] = MMX_OP2(psubl), + [0xfb] = MMX_OP2(psubq), + [0xfc] = MMX_OP2(paddb), + [0xfd] = MMX_OP2(paddw), + [0xfe] = MMX_OP2(paddl), +}; + +static void *sse_op_table2[3 * 8][2] = { + [0 + 2] = MMX_OP2(psrlw), + [0 + 4] = MMX_OP2(psraw), + [0 + 6] = MMX_OP2(psllw), + [8 + 2] = MMX_OP2(psrld), + [8 + 4] = MMX_OP2(psrad), + [8 + 6] = MMX_OP2(pslld), + [16 + 2] = MMX_OP2(psrlq), + [16 + 3] = { NULL, gen_helper_psrldq_xmm }, + [16 + 6] = MMX_OP2(psllq), + [16 + 7] = { NULL, gen_helper_pslldq_xmm }, +}; + +static void *sse_op_table3[4 * 3] = { + gen_helper_cvtsi2ss, + gen_helper_cvtsi2sd, + X86_64_ONLY(gen_helper_cvtsq2ss), + X86_64_ONLY(gen_helper_cvtsq2sd), + + gen_helper_cvttss2si, + gen_helper_cvttsd2si, + X86_64_ONLY(gen_helper_cvttss2sq), + X86_64_ONLY(gen_helper_cvttsd2sq), + + gen_helper_cvtss2si, + gen_helper_cvtsd2si, + X86_64_ONLY(gen_helper_cvtss2sq), + X86_64_ONLY(gen_helper_cvtsd2sq), +}; + +static void *sse_op_table4[8][4] = { + SSE_FOP(cmpeq), + SSE_FOP(cmplt), + SSE_FOP(cmple), + SSE_FOP(cmpunord), + SSE_FOP(cmpneq), + SSE_FOP(cmpnlt), + SSE_FOP(cmpnle), + SSE_FOP(cmpord), +}; + +static void *sse_op_table5[256] = { + [0x0c] = gen_helper_pi2fw, + [0x0d] = gen_helper_pi2fd, + [0x1c] = gen_helper_pf2iw, + [0x1d] = gen_helper_pf2id, + [0x8a] = gen_helper_pfnacc, + [0x8e] = gen_helper_pfpnacc, + [0x90] = gen_helper_pfcmpge, + [0x94] = gen_helper_pfmin, + [0x96] = gen_helper_pfrcp, + [0x97] = gen_helper_pfrsqrt, + [0x9a] = gen_helper_pfsub, + [0x9e] = gen_helper_pfadd, + [0xa0] = gen_helper_pfcmpgt, + [0xa4] = gen_helper_pfmax, + [0xa6] = gen_helper_movq, /* pfrcpit1; no need to actually increase precision */ + [0xa7] = gen_helper_movq, /* pfrsqit1 */ + 
[0xaa] = gen_helper_pfsubr, + [0xae] = gen_helper_pfacc, + [0xb0] = gen_helper_pfcmpeq, + [0xb4] = gen_helper_pfmul, + [0xb6] = gen_helper_movq, /* pfrcpit2 */ + [0xb7] = gen_helper_pmulhrw_mmx, + [0xbb] = gen_helper_pswapd, + [0xbf] = gen_helper_pavgb_mmx /* pavgusb */ +}; + +struct sse_op_helper_s { + void *op[2]; uint32_t ext_mask; +}; +#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } +#define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 } +#define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 } +#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 } +static struct sse_op_helper_s sse_op_table6[256] = { + [0x00] = SSSE3_OP(pshufb), + [0x01] = SSSE3_OP(phaddw), + [0x02] = SSSE3_OP(phaddd), + [0x03] = SSSE3_OP(phaddsw), + [0x04] = SSSE3_OP(pmaddubsw), + [0x05] = SSSE3_OP(phsubw), + [0x06] = SSSE3_OP(phsubd), + [0x07] = SSSE3_OP(phsubsw), + [0x08] = SSSE3_OP(psignb), + [0x09] = SSSE3_OP(psignw), + [0x0a] = SSSE3_OP(psignd), + [0x0b] = SSSE3_OP(pmulhrsw), + [0x10] = SSE41_OP(pblendvb), + [0x14] = SSE41_OP(blendvps), + [0x15] = SSE41_OP(blendvpd), + [0x17] = SSE41_OP(ptest), + [0x1c] = SSSE3_OP(pabsb), + [0x1d] = SSSE3_OP(pabsw), + [0x1e] = SSSE3_OP(pabsd), + [0x20] = SSE41_OP(pmovsxbw), + [0x21] = SSE41_OP(pmovsxbd), + [0x22] = SSE41_OP(pmovsxbq), + [0x23] = SSE41_OP(pmovsxwd), + [0x24] = SSE41_OP(pmovsxwq), + [0x25] = SSE41_OP(pmovsxdq), + [0x28] = SSE41_OP(pmuldq), + [0x29] = SSE41_OP(pcmpeqq), + [0x2a] = SSE41_SPECIAL, /* movntqda */ + [0x2b] = SSE41_OP(packusdw), + [0x30] = SSE41_OP(pmovzxbw), + [0x31] = SSE41_OP(pmovzxbd), + [0x32] = SSE41_OP(pmovzxbq), + [0x33] = SSE41_OP(pmovzxwd), + [0x34] = SSE41_OP(pmovzxwq), + [0x35] = SSE41_OP(pmovzxdq), + [0x37] = SSE42_OP(pcmpgtq), + [0x38] = SSE41_OP(pminsb), + [0x39] = SSE41_OP(pminsd), + [0x3a] = SSE41_OP(pminuw), + [0x3b] = SSE41_OP(pminud), + [0x3c] = SSE41_OP(pmaxsb), + [0x3d] = SSE41_OP(pmaxsd), + [0x3e] = SSE41_OP(pmaxuw), + [0x3f] = SSE41_OP(pmaxud), + [0x40] = SSE41_OP(pmulld), + [0x41] = SSE41_OP(phminposuw), +}; + +static struct sse_op_helper_s sse_op_table7[256] = { + [0x08] = SSE41_OP(roundps), + [0x09] = SSE41_OP(roundpd), + [0x0a] = SSE41_OP(roundss), + [0x0b] = SSE41_OP(roundsd), + [0x0c] = SSE41_OP(blendps), + [0x0d] = SSE41_OP(blendpd), + [0x0e] = SSE41_OP(pblendw), + [0x0f] = SSSE3_OP(palignr), + [0x14] = SSE41_SPECIAL, /* pextrb */ + [0x15] = SSE41_SPECIAL, /* pextrw */ + [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */ + [0x17] = SSE41_SPECIAL, /* extractps */ + [0x20] = SSE41_SPECIAL, /* pinsrb */ + [0x21] = SSE41_SPECIAL, /* insertps */ + [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */ + [0x40] = SSE41_OP(dpps), + [0x41] = SSE41_OP(dppd), + [0x42] = SSE41_OP(mpsadbw), + [0x60] = SSE42_OP(pcmpestrm), + [0x61] = SSE42_OP(pcmpestri), + [0x62] = SSE42_OP(pcmpistrm), + [0x63] = SSE42_OP(pcmpistri), +}; + +static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) +{ + int b1, op1_offset, op2_offset, is_xmm, val, ot; + int modrm, mod, rm, reg, reg_addr, offset_addr; + void *sse_op2; + + b &= 0xff; + if (s->prefix & PREFIX_DATA) + b1 = 1; + else if (s->prefix & PREFIX_REPZ) + b1 = 2; + else if (s->prefix & PREFIX_REPNZ) + b1 = 3; + else + b1 = 0; + sse_op2 = sse_op_table1[b][b1]; + if (!sse_op2) + goto illegal_op; + if ((b <= 0x5f && b >= 0x10) || b == 0xc6 || b == 0xc2) { + is_xmm = 1; + } else { + if (b1 == 0) { + /* MMX case */ + is_xmm = 0; + } else { + is_xmm = 1; + } + } + /* simple MMX/SSE operation */ + if (s->flags & HF_TS_MASK) { + gen_exception(s, 
EXCP07_PREX, pc_start - s->cs_base); + return; + } + if (s->flags & HF_EM_MASK) { + illegal_op: + gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base); + return; + } + if (is_xmm && !(s->flags & HF_OSFXSR_MASK)) + if ((b != 0x38 && b != 0x3a) || (s->prefix & PREFIX_DATA)) + goto illegal_op; + if (b == 0x0e) { + if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) + goto illegal_op; + /* femms */ + gen_helper_emms(); + return; + } + if (b == 0x77) { + /* emms */ + gen_helper_emms(); + return; + } + /* prepare MMX state (XXX: optimize by storing fptt and fptags in + the static cpu state) */ + if (!is_xmm) { + gen_helper_enter_mmx(); + } + + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7); + if (is_xmm) + reg |= rex_r; + mod = (modrm >> 6) & 3; + if (sse_op2 == SSE_SPECIAL) { + b |= (b1 << 8); + switch(b) { + case 0x0e7: /* movntq */ + if (mod == 3) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,fpregs[reg].mmx)); + break; + case 0x1e7: /* movntdq */ + case 0x02b: /* movntps */ + case 0x12b: /* movntps */ + if (mod == 3) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_sto_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg])); + break; + case 0x3f0: /* lddqu */ + if (mod == 3) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldo_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg])); + break; + case 0x22b: /* movntss */ + case 0x32b: /* movntsd */ + if (mod == 3) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if (b1 & 1) { + gen_stq_env_A0(s->mem_index, offsetof(CPUX86State, + xmm_regs[reg])); + } else { + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(0))); + gen_op_st_T0_A0(OT_LONG + s->mem_index); + } + break; + case 0x6e: /* movd mm, ea */ +#ifdef TARGET_X86_64 + if (s->dflag == 2) { + gen_ldst_modrm(s, modrm, OT_QUAD, OR_TMP0, 0); + tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,fpregs[reg].mmx)); + } else +#endif + { + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + offsetof(CPUX86State,fpregs[reg].mmx)); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_movl_mm_T0_mmx(cpu_ptr0, cpu_tmp2_i32); + } + break; + case 0x16e: /* movd xmm, ea */ +#ifdef TARGET_X86_64 + if (s->dflag == 2) { + gen_ldst_modrm(s, modrm, OT_QUAD, OR_TMP0, 0); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + offsetof(CPUX86State,xmm_regs[reg])); + gen_helper_movq_mm_T0_xmm(cpu_ptr0, cpu_T[0]); + } else +#endif + { + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + offsetof(CPUX86State,xmm_regs[reg])); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_movl_mm_T0_xmm(cpu_ptr0, cpu_tmp2_i32); + } + break; + case 0x6f: /* movq mm, ea */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,fpregs[reg].mmx)); + } else { + rm = (modrm & 7); + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, + offsetof(CPUX86State,fpregs[rm].mmx)); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, + offsetof(CPUX86State,fpregs[reg].mmx)); + } + break; + case 0x010: /* movups */ + case 0x110: /* movupd */ + case 0x028: /* movaps */ + case 0x128: /* movapd */ + case 0x16f: /* movdqa xmm, ea */ + case 0x26f: /* movdqu xmm, ea */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldo_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg])); + } else { + rm = (modrm & 7) | REX_B(s); + 
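+                /* register source: gen_op_movo copies the full 128-bit XMM register as two 64-bit halves */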
gen_op_movo(offsetof(CPUX86State,xmm_regs[reg]), + offsetof(CPUX86State,xmm_regs[rm])); + } + break; + case 0x210: /* movss xmm, ea */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_op_ld_T0_A0(OT_LONG + s->mem_index); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(0))); + gen_op_movl_T0_0(); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(1))); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(2))); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(3))); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)), + offsetof(CPUX86State,xmm_regs[rm].XMM_L(0))); + } + break; + case 0x310: /* movsd xmm, ea */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + gen_op_movl_T0_0(); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(2))); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(3))); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)), + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0))); + } + break; + case 0x012: /* movlps */ + case 0x112: /* movlpd */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + } else { + /* movhlps */ + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)), + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(1))); + } + break; + case 0x212: /* movsldup */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldo_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg])); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)), + offsetof(CPUX86State,xmm_regs[rm].XMM_L(0))); + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)), + offsetof(CPUX86State,xmm_regs[rm].XMM_L(2))); + } + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)), + offsetof(CPUX86State,xmm_regs[reg].XMM_L(0))); + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)), + offsetof(CPUX86State,xmm_regs[reg].XMM_L(2))); + break; + case 0x312: /* movddup */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)), + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0))); + } + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)), + offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + break; + case 0x016: /* movhps */ + case 0x116: /* movhpd */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1))); + } else { + /* movlhps */ + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)), + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0))); + } + break; + case 0x216: /* movshdup */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldo_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg])); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)), + offsetof(CPUX86State,xmm_regs[rm].XMM_L(1))); + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)), + 
offsetof(CPUX86State,xmm_regs[rm].XMM_L(3))); + } + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)), + offsetof(CPUX86State,xmm_regs[reg].XMM_L(1))); + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)), + offsetof(CPUX86State,xmm_regs[reg].XMM_L(3))); + break; + case 0x178: + case 0x378: + { + int bit_index, field_length; + + if (b1 == 1 && reg != 0) + goto illegal_op; + field_length = ldub_code(s->pc++) & 0x3F; + bit_index = ldub_code(s->pc++) & 0x3F; + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + offsetof(CPUX86State,xmm_regs[reg])); + if (b1 == 1) + gen_helper_extrq_i(cpu_ptr0, tcg_const_i32(bit_index), + tcg_const_i32(field_length)); + else + gen_helper_insertq_i(cpu_ptr0, tcg_const_i32(bit_index), + tcg_const_i32(field_length)); + } + break; + case 0x7e: /* movd ea, mm */ +#ifdef TARGET_X86_64 + if (s->dflag == 2) { + tcg_gen_ld_i64(cpu_T[0], cpu_env, + offsetof(CPUX86State,fpregs[reg].mmx)); + gen_ldst_modrm(s, modrm, OT_QUAD, OR_TMP0, 1); + } else +#endif + { + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,fpregs[reg].mmx.MMX_L(0))); + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1); + } + break; + case 0x17e: /* movd ea, xmm */ +#ifdef TARGET_X86_64 + if (s->dflag == 2) { + tcg_gen_ld_i64(cpu_T[0], cpu_env, + offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + gen_ldst_modrm(s, modrm, OT_QUAD, OR_TMP0, 1); + } else +#endif + { + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,xmm_regs[reg].XMM_L(0))); + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1); + } + break; + case 0x27e: /* movq xmm, ea */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)), + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0))); + } + gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1))); + break; + case 0x7f: /* movq ea, mm */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,fpregs[reg].mmx)); + } else { + rm = (modrm & 7); + gen_op_movq(offsetof(CPUX86State,fpregs[rm].mmx), + offsetof(CPUX86State,fpregs[reg].mmx)); + } + break; + case 0x011: /* movups */ + case 0x111: /* movupd */ + case 0x029: /* movaps */ + case 0x129: /* movapd */ + case 0x17f: /* movdqa ea, xmm */ + case 0x27f: /* movdqu ea, xmm */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_sto_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg])); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movo(offsetof(CPUX86State,xmm_regs[rm]), + offsetof(CPUX86State,xmm_regs[reg])); + } + break; + case 0x211: /* movss ea, xmm */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(0))); + gen_op_st_T0_A0(OT_LONG + s->mem_index); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movl(offsetof(CPUX86State,xmm_regs[rm].XMM_L(0)), + offsetof(CPUX86State,xmm_regs[reg].XMM_L(0))); + } + break; + case 0x311: /* movsd ea, xmm */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)), + offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + } + break; + case 0x013: /* movlps */ + case 0x113: /* movlpd */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + 
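+                /* memory destination: store only the low 64 bits of the source XMM register */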
gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + } else { + goto illegal_op; + } + break; + case 0x017: /* movhps */ + case 0x117: /* movhpd */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1))); + } else { + goto illegal_op; + } + break; + case 0x71: /* shift mm, im */ + case 0x72: + case 0x73: + case 0x171: /* shift xmm, im */ + case 0x172: + case 0x173: + if (b1 >= 2) { + goto illegal_op; + } + val = ldub_code(s->pc++); + if (is_xmm) { + gen_op_movl_T0_im(val); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_t0.XMM_L(0))); + gen_op_movl_T0_0(); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_t0.XMM_L(1))); + op1_offset = offsetof(CPUX86State,xmm_t0); + } else { + gen_op_movl_T0_im(val); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,mmx_t0.MMX_L(0))); + gen_op_movl_T0_0(); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,mmx_t0.MMX_L(1))); + op1_offset = offsetof(CPUX86State,mmx_t0); + } + sse_op2 = sse_op_table2[((b - 1) & 3) * 8 + (((modrm >> 3)) & 7)][b1]; + if (!sse_op2) + goto illegal_op; + if (is_xmm) { + rm = (modrm & 7) | REX_B(s); + op2_offset = offsetof(CPUX86State,xmm_regs[rm]); + } else { + rm = (modrm & 7); + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); + } + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op2_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op1_offset); + ((void (*)(TCGv_ptr, TCGv_ptr))sse_op2)(cpu_ptr0, cpu_ptr1); + break; + case 0x050: /* movmskps */ + rm = (modrm & 7) | REX_B(s); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + offsetof(CPUX86State,xmm_regs[rm])); + gen_helper_movmskps(cpu_tmp2_i32, cpu_ptr0); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_mov_reg_T0(OT_LONG, reg); + break; + case 0x150: /* movmskpd */ + rm = (modrm & 7) | REX_B(s); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + offsetof(CPUX86State,xmm_regs[rm])); + gen_helper_movmskpd(cpu_tmp2_i32, cpu_ptr0); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_mov_reg_T0(OT_LONG, reg); + break; + case 0x02a: /* cvtpi2ps */ + case 0x12a: /* cvtpi2pd */ + gen_helper_enter_mmx(); + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + op2_offset = offsetof(CPUX86State,mmx_t0); + gen_ldq_env_A0(s->mem_index, op2_offset); + } else { + rm = (modrm & 7); + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); + } + op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + switch(b >> 8) { + case 0x0: + gen_helper_cvtpi2ps(cpu_ptr0, cpu_ptr1); + break; + default: + case 0x1: + gen_helper_cvtpi2pd(cpu_ptr0, cpu_ptr1); + break; + } + break; + case 0x22a: /* cvtsi2ss */ + case 0x32a: /* cvtsi2sd */ + ot = (s->dflag == 2) ? 
OT_QUAD : OT_LONG; + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + sse_op2 = sse_op_table3[(s->dflag == 2) * 2 + ((b >> 8) - 2)]; + if (ot == OT_LONG) { + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + ((void (*)(TCGv_ptr, TCGv_i32))sse_op2)(cpu_ptr0, cpu_tmp2_i32); + } else { + ((void (*)(TCGv_ptr, TCGv))sse_op2)(cpu_ptr0, cpu_T[0]); + } + break; + case 0x02c: /* cvttps2pi */ + case 0x12c: /* cvttpd2pi */ + case 0x02d: /* cvtps2pi */ + case 0x12d: /* cvtpd2pi */ + gen_helper_enter_mmx(); + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + op2_offset = offsetof(CPUX86State,xmm_t0); + gen_ldo_env_A0(s->mem_index, op2_offset); + } else { + rm = (modrm & 7) | REX_B(s); + op2_offset = offsetof(CPUX86State,xmm_regs[rm]); + } + op1_offset = offsetof(CPUX86State,fpregs[reg & 7].mmx); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + switch(b) { + case 0x02c: + gen_helper_cvttps2pi(cpu_ptr0, cpu_ptr1); + break; + case 0x12c: + gen_helper_cvttpd2pi(cpu_ptr0, cpu_ptr1); + break; + case 0x02d: + gen_helper_cvtps2pi(cpu_ptr0, cpu_ptr1); + break; + case 0x12d: + gen_helper_cvtpd2pi(cpu_ptr0, cpu_ptr1); + break; + } + break; + case 0x22c: /* cvttss2si */ + case 0x32c: /* cvttsd2si */ + case 0x22d: /* cvtss2si */ + case 0x32d: /* cvtsd2si */ + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if ((b >> 8) & 1) { + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_t0.XMM_Q(0))); + } else { + gen_op_ld_T0_A0(OT_LONG + s->mem_index); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_t0.XMM_L(0))); + } + op2_offset = offsetof(CPUX86State,xmm_t0); + } else { + rm = (modrm & 7) | REX_B(s); + op2_offset = offsetof(CPUX86State,xmm_regs[rm]); + } + sse_op2 = sse_op_table3[(s->dflag == 2) * 2 + ((b >> 8) - 2) + 4 + + (b & 1) * 4]; + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op2_offset); + if (ot == OT_LONG) { + ((void (*)(TCGv_i32, TCGv_ptr))sse_op2)(cpu_tmp2_i32, cpu_ptr0); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + } else { + ((void (*)(TCGv, TCGv_ptr))sse_op2)(cpu_T[0], cpu_ptr0); + } + gen_op_mov_reg_T0(ot, reg); + break; + case 0xc4: /* pinsrw */ + case 0x1c4: + s->rip_offset = 1; + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0); + val = ldub_code(s->pc++); + if (b1) { + val &= 7; + tcg_gen_st16_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,xmm_regs[reg].XMM_W(val))); + } else { + val &= 3; + tcg_gen_st16_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,fpregs[reg].mmx.MMX_W(val))); + } + break; + case 0xc5: /* pextrw */ + case 0x1c5: + if (mod != 3) + goto illegal_op; + ot = (s->dflag == 2) ? 
OT_QUAD : OT_LONG; + val = ldub_code(s->pc++); + if (b1) { + val &= 7; + rm = (modrm & 7) | REX_B(s); + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,xmm_regs[rm].XMM_W(val))); + } else { + val &= 3; + rm = (modrm & 7); + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,fpregs[rm].mmx.MMX_W(val))); + } + reg = ((modrm >> 3) & 7) | rex_r; + gen_op_mov_reg_T0(ot, reg); + break; + case 0x1d6: /* movq ea, xmm */ + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)), + offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); + gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(1))); + } + break; + case 0x2d6: /* movq2dq */ + gen_helper_enter_mmx(); + rm = (modrm & 7); + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)), + offsetof(CPUX86State,fpregs[rm].mmx)); + gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1))); + break; + case 0x3d6: /* movdq2q */ + gen_helper_enter_mmx(); + rm = (modrm & 7) | REX_B(s); + gen_op_movq(offsetof(CPUX86State,fpregs[reg & 7].mmx), + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0))); + break; + case 0xd7: /* pmovmskb */ + case 0x1d7: + if (mod != 3) + goto illegal_op; + if (b1) { + rm = (modrm & 7) | REX_B(s); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,xmm_regs[rm])); + gen_helper_pmovmskb_xmm(cpu_tmp2_i32, cpu_ptr0); + } else { + rm = (modrm & 7); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,fpregs[rm].mmx)); + gen_helper_pmovmskb_mmx(cpu_tmp2_i32, cpu_ptr0); + } + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + reg = ((modrm >> 3) & 7) | rex_r; + gen_op_mov_reg_T0(OT_LONG, reg); + break; + case 0x138: + if (s->prefix & PREFIX_REPNZ) + goto crc32; + case 0x038: + b = modrm; + modrm = ldub_code(s->pc++); + rm = modrm & 7; + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + if (b1 >= 2) { + goto illegal_op; + } + + sse_op2 = sse_op_table6[b].op[b1]; + if (!sse_op2) + goto illegal_op; + if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) + goto illegal_op; + + if (b1) { + op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + if (mod == 3) { + op2_offset = offsetof(CPUX86State,xmm_regs[rm | REX_B(s)]); + } else { + op2_offset = offsetof(CPUX86State,xmm_t0); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + switch (b) { + case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ + case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ + case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ + gen_ldq_env_A0(s->mem_index, op2_offset + + offsetof(XMMReg, XMM_Q(0))); + break; + case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */ + case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */ + tcg_gen_qemu_ld32u(cpu_tmp0, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp0); + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset + + offsetof(XMMReg, XMM_L(0))); + break; + case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */ + tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset + + offsetof(XMMReg, XMM_W(0))); + break; + case 0x2a: /* movntqda */ + gen_ldo_env_A0(s->mem_index, op1_offset); + return; + default: + gen_ldo_env_A0(s->mem_index, op2_offset); + } + } + } else { + op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); + if (mod == 3) { + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); + } else { + op2_offset = offsetof(CPUX86State,mmx_t0); + gen_lea_modrm(s, 
modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, op2_offset); + } + } + if (sse_op2 == SSE_SPECIAL) + goto illegal_op; + + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + ((void (*)(TCGv_ptr, TCGv_ptr))sse_op2)(cpu_ptr0, cpu_ptr1); + + if (b == 0x17) + s->cc_op = CC_OP_EFLAGS; + break; + case 0x338: /* crc32 */ + crc32: + b = modrm; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + + if (b != 0xf0 && b != 0xf1) + goto illegal_op; + if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) + goto illegal_op; + + if (b == 0xf0) + ot = OT_BYTE; + else if (b == 0xf1 && s->dflag != 2) + if (s->prefix & PREFIX_DATA) + ot = OT_WORD; + else + ot = OT_LONG; + else + ot = OT_QUAD; + + gen_op_mov_TN_reg(OT_LONG, 0, reg); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + gen_helper_crc32(cpu_T[0], cpu_tmp2_i32, + cpu_T[0], tcg_const_i32(8 << ot)); + + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; + gen_op_mov_reg_T0(ot, reg); + break; + case 0x03a: + case 0x13a: + b = modrm; + modrm = ldub_code(s->pc++); + rm = modrm & 7; + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + if (b1 >= 2) { + goto illegal_op; + } + + sse_op2 = sse_op_table7[b].op[b1]; + if (!sse_op2) + goto illegal_op; + if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) + goto illegal_op; + + if (sse_op2 == SSE_SPECIAL) { + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; + rm = (modrm & 7) | REX_B(s); + if (mod != 3) + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + reg = ((modrm >> 3) & 7) | rex_r; + val = ldub_code(s->pc++); + switch (b) { + case 0x14: /* pextrb */ + tcg_gen_ld8u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_B(val & 15))); + if (mod == 3) + gen_op_mov_reg_T0(ot, rm); + else + tcg_gen_qemu_st8(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 0x15: /* pextrw */ + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_W(val & 7))); + if (mod == 3) + gen_op_mov_reg_T0(ot, rm); + else + tcg_gen_qemu_st16(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 0x16: + if (ot == OT_LONG) { /* pextrd */ + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_L(val & 3))); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + if (mod == 3) + gen_op_mov_reg_v(ot, rm, cpu_T[0]); + else + tcg_gen_qemu_st32(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + } else { /* pextrq */ +#ifdef TARGET_X86_64 + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_Q(val & 1))); + if (mod == 3) + gen_op_mov_reg_v(ot, rm, cpu_tmp1_i64); + else + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); +#else + goto illegal_op; +#endif + } + break; + case 0x17: /* extractps */ + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(val & 3))); + if (mod == 3) + gen_op_mov_reg_T0(ot, rm); + else + tcg_gen_qemu_st32(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 0x20: /* pinsrb */ + if (mod == 3) + gen_op_mov_TN_reg(OT_LONG, 0, rm); + else + tcg_gen_qemu_ld8u(cpu_tmp0, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st8_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_B(val & 15))); + break; + case 0x21: /* insertps */ + if (mod == 3) { + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, + offsetof(CPUX86State,xmm_regs[rm] + .XMM_L((val >> 6) & 3))); + } else { + tcg_gen_qemu_ld32u(cpu_tmp0, cpu_A0, + (s->mem_index >> 2) - 1); + 
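+                        /* narrow the value loaded from memory to 32 bits; it is inserted into the selected dword below */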
tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp0); + } + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, + offsetof(CPUX86State,xmm_regs[reg] + .XMM_L((val >> 4) & 3))); + if ((val >> 0) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(0))); + if ((val >> 1) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(1))); + if ((val >> 2) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(2))); + if ((val >> 3) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(3))); + break; + case 0x22: + if (ot == OT_LONG) { /* pinsrd */ + if (mod == 3) + gen_op_mov_v_reg(ot, cpu_tmp0, rm); + else + tcg_gen_qemu_ld32u(cpu_tmp0, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp0); + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_L(val & 3))); + } else { /* pinsrq */ +#ifdef TARGET_X86_64 + if (mod == 3) + gen_op_mov_v_reg(ot, cpu_tmp1_i64, rm); + else + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_Q(val & 1))); +#else + goto illegal_op; +#endif + } + break; + } + return; + } + + if (b1) { + op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + if (mod == 3) { + op2_offset = offsetof(CPUX86State,xmm_regs[rm | REX_B(s)]); + } else { + op2_offset = offsetof(CPUX86State,xmm_t0); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldo_env_A0(s->mem_index, op2_offset); + } + } else { + op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); + if (mod == 3) { + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); + } else { + op2_offset = offsetof(CPUX86State,mmx_t0); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_ldq_env_A0(s->mem_index, op2_offset); + } + } + val = ldub_code(s->pc++); + + if ((b & 0xfc) == 0x60) { /* pcmpXstrX */ + s->cc_op = CC_OP_EFLAGS; + + if (s->dflag == 2) + /* The helper must use entire 64-bit gp registers */ + val |= 1 << 8; + } + + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + ((void (*)(TCGv_ptr, TCGv_ptr, TCGv_i32))sse_op2)(cpu_ptr0, cpu_ptr1, tcg_const_i32(val)); + break; + default: + goto illegal_op; + } + } else { + /* generic MMX or SSE operation */ + switch(b) { + case 0x70: /* pshufx insn */ + case 0xc6: /* pshufx insn */ + case 0xc2: /* compare insns */ + s->rip_offset = 1; + break; + default: + break; + } + if (is_xmm) { + op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + op2_offset = offsetof(CPUX86State,xmm_t0); + if (b1 >= 2 && ((b >= 0x50 && b <= 0x5f && b != 0x5b) || + b == 0xc2)) { + /* specific case for SSE single instructions */ + if (b1 == 2) { + /* 32 bit access */ + gen_op_ld_T0_A0(OT_LONG + s->mem_index); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_t0.XMM_L(0))); + } else { + /* 64 bit access */ + gen_ldq_env_A0(s->mem_index, offsetof(CPUX86State,xmm_t0.XMM_D(0))); + } + } else { + gen_ldo_env_A0(s->mem_index, op2_offset); + } + } else { + rm = (modrm & 7) | REX_B(s); + op2_offset = offsetof(CPUX86State,xmm_regs[rm]); + } + } else { + op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + op2_offset = offsetof(CPUX86State,mmx_t0); + gen_ldq_env_A0(s->mem_index, 
op2_offset); + } else { + rm = (modrm & 7); + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); + } + } + switch(b) { + case 0x0f: /* 3DNow! data insns */ + if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) + goto illegal_op; + val = ldub_code(s->pc++); + sse_op2 = sse_op_table5[val]; + if (!sse_op2) + goto illegal_op; + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + ((void (*)(TCGv_ptr, TCGv_ptr))sse_op2)(cpu_ptr0, cpu_ptr1); + break; + case 0x70: /* pshufx insn */ + case 0xc6: /* pshufx insn */ + val = ldub_code(s->pc++); + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + ((void (*)(TCGv_ptr, TCGv_ptr, TCGv_i32))sse_op2)(cpu_ptr0, cpu_ptr1, tcg_const_i32(val)); + break; + case 0xc2: + /* compare insns */ + val = ldub_code(s->pc++); + if (val >= 8) + goto illegal_op; + sse_op2 = sse_op_table4[val][b1]; + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + ((void (*)(TCGv_ptr, TCGv_ptr))sse_op2)(cpu_ptr0, cpu_ptr1); + break; + case 0xf7: + /* maskmov : we must prepare A0 */ + if (mod != 3) + goto illegal_op; +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + gen_op_movq_A0_reg(R_EDI); + } else +#endif + { + gen_op_movl_A0_reg(R_EDI); + if (s->aflag == 0) + gen_op_andl_A0_ffff(); + } + gen_add_A0_ds_seg(s); + + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + ((void (*)(TCGv_ptr, TCGv_ptr, TCGv))sse_op2)(cpu_ptr0, cpu_ptr1, cpu_A0); + break; + default: + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); + ((void (*)(TCGv_ptr, TCGv_ptr))sse_op2)(cpu_ptr0, cpu_ptr1); + break; + } + if (b == 0x2e || b == 0x2f) { + s->cc_op = CC_OP_EFLAGS; + } + } +} + +#ifdef VBOX +/* Checks if it's an invalid lock sequence. Only a few instructions + can be used together with the lock prefix and of those only the + form that write a memory operand. So, this is kind of annoying + work to do... + The AMD manual lists the following instructions. 
+ ADC + ADD + AND + BTC + BTR + BTS + CMPXCHG + CMPXCHG8B + CMPXCHG16B + DEC + INC + NEG + NOT + OR + SBB + SUB + XADD + XCHG + XOR */ +static bool is_invalid_lock_sequence(DisasContext *s, target_ulong pc_start, int b) +{ + target_ulong pc = s->pc; + int modrm, mod, op; + + /* X={8,16,32,64} Y={16,32,64} */ + switch (b) + { + /* /2: ADC reg/memX, immX */ + /* /0: ADD reg/memX, immX */ + /* /4: AND reg/memX, immX */ + /* /1: OR reg/memX, immX */ + /* /3: SBB reg/memX, immX */ + /* /5: SUB reg/memX, immX */ + /* /6: XOR reg/memX, immX */ + case 0x80: + case 0x81: + case 0x83: + modrm = ldub_code(pc++); + op = (modrm >> 3) & 7; + if (op == 7) /* /7: CMP */ + break; + mod = (modrm >> 6) & 3; + if (mod == 3) /* register destination */ + break; + return false; + + case 0x10: /* /r: ADC reg/mem8, reg8 */ + case 0x11: /* /r: ADC reg/memX, regY */ + case 0x00: /* /r: ADD reg/mem8, reg8 */ + case 0x01: /* /r: ADD reg/memX, regY */ + case 0x20: /* /r: AND reg/mem8, reg8 */ + case 0x21: /* /r: AND reg/memY, regY */ + case 0x08: /* /r: OR reg/mem8, reg8 */ + case 0x09: /* /r: OR reg/memY, regY */ + case 0x18: /* /r: SBB reg/mem8, reg8 */ + case 0x19: /* /r: SBB reg/memY, regY */ + case 0x28: /* /r: SUB reg/mem8, reg8 */ + case 0x29: /* /r: SUB reg/memY, regY */ + case 0x86: /* /r: XCHG reg/mem8, reg8 or XCHG reg8, reg/mem8 */ + case 0x87: /* /r: XCHG reg/memY, regY or XCHG regY, reg/memY */ + case 0x30: /* /r: XOR reg/mem8, reg8 */ + case 0x31: /* /r: XOR reg/memY, regY */ + modrm = ldub_code(pc++); + mod = (modrm >> 6) & 3; + if (mod == 3) /* register destination */ + break; + return false; + + /* /1: DEC reg/memX */ + /* /0: INC reg/memX */ + case 0xfe: + case 0xff: + modrm = ldub_code(pc++); + mod = (modrm >> 6) & 3; + if (mod == 3) /* register destination */ + break; + return false; + + /* /3: NEG reg/memX */ + /* /2: NOT reg/memX */ + case 0xf6: + case 0xf7: + modrm = ldub_code(pc++); + mod = (modrm >> 6) & 3; + if (mod == 3) /* register destination */ + break; + return false; + + case 0x0f: + b = ldub_code(pc++); + switch (b) + { + /* /7: BTC reg/memY, imm8 */ + /* /6: BTR reg/memY, imm8 */ + /* /5: BTS reg/memY, imm8 */ + case 0xba: + modrm = ldub_code(pc++); + op = (modrm >> 3) & 7; + if (op < 5) + break; + mod = (modrm >> 6) & 3; + if (mod == 3) /* register destination */ + break; + return false; + + case 0xbb: /* /r: BTC reg/memY, regY */ + case 0xb3: /* /r: BTR reg/memY, regY */ + case 0xab: /* /r: BTS reg/memY, regY */ + case 0xb0: /* /r: CMPXCHG reg/mem8, reg8 */ + case 0xb1: /* /r: CMPXCHG reg/memY, regY */ + case 0xc0: /* /r: XADD reg/mem8, reg8 */ + case 0xc1: /* /r: XADD reg/memY, regY */ + modrm = ldub_code(pc++); + mod = (modrm >> 6) & 3; + if (mod == 3) /* register destination */ + break; + return false; + + /* /1: CMPXCHG8B mem64 or CMPXCHG16B mem128 */ + case 0xc7: + modrm = ldub_code(pc++); + op = (modrm >> 3) & 7; + if (op != 1) + break; + return false; + } + break; + } + + /* illegal sequence. The s->pc is past the lock prefix and that + is sufficient for the TB, I think. */ + Log(("illegal lock sequence %RGv (b=%#x)\n", pc_start, b)); + return true; +} +#endif /* VBOX */ + +/* convert one instruction. s->is_jmp is set if the translation must + be stopped. 
Return the next pc value */ +static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) +{ + int b, prefixes, aflag, dflag; + int shift, ot; + int modrm, reg, rm, mod, reg_addr, op, opreg, offset_addr, val; + target_ulong next_eip, tval; + int rex_w, rex_r; + + if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) + tcg_gen_debug_insn_start(pc_start); + s->pc = pc_start; + prefixes = 0; + aflag = s->code32; + dflag = s->code32; + s->override = -1; + rex_w = -1; + rex_r = 0; +#ifdef TARGET_X86_64 + s->rex_x = 0; + s->rex_b = 0; + x86_64_hregs = 0; +#endif + s->rip_offset = 0; /* for relative ip address */ +#ifdef VBOX + /* nike: seems only slow down things */ +# if 0 + /* Always update EIP. Otherwise one must be very careful with generated code that can raise exceptions. */ + + gen_update_eip(pc_start - s->cs_base); +# endif +#endif /* VBOX */ + + next_byte: + b = ldub_code(s->pc); + s->pc++; + /* check prefixes */ +#ifdef TARGET_X86_64 + if (CODE64(s)) { + switch (b) { + case 0xf3: + prefixes |= PREFIX_REPZ; + goto next_byte; + case 0xf2: + prefixes |= PREFIX_REPNZ; + goto next_byte; + case 0xf0: + prefixes |= PREFIX_LOCK; + goto next_byte; + case 0x2e: + s->override = R_CS; + goto next_byte; + case 0x36: + s->override = R_SS; + goto next_byte; + case 0x3e: + s->override = R_DS; + goto next_byte; + case 0x26: + s->override = R_ES; + goto next_byte; + case 0x64: + s->override = R_FS; + goto next_byte; + case 0x65: + s->override = R_GS; + goto next_byte; + case 0x66: + prefixes |= PREFIX_DATA; + goto next_byte; + case 0x67: + prefixes |= PREFIX_ADR; + goto next_byte; + case 0x40 ... 0x4f: + /* REX prefix */ + rex_w = (b >> 3) & 1; + rex_r = (b & 0x4) << 1; + s->rex_x = (b & 0x2) << 2; + REX_B(s) = (b & 0x1) << 3; + x86_64_hregs = 1; /* select uniform byte register addressing */ + goto next_byte; + } + if (rex_w == 1) { + /* 0x66 is ignored if rex.w is set */ + dflag = 2; + } else { + if (prefixes & PREFIX_DATA) + dflag ^= 1; + } + if (!(prefixes & PREFIX_ADR)) + aflag = 2; + } else +#endif + { + switch (b) { + case 0xf3: + prefixes |= PREFIX_REPZ; + goto next_byte; + case 0xf2: + prefixes |= PREFIX_REPNZ; + goto next_byte; + case 0xf0: + prefixes |= PREFIX_LOCK; + goto next_byte; + case 0x2e: + s->override = R_CS; + goto next_byte; + case 0x36: + s->override = R_SS; + goto next_byte; + case 0x3e: + s->override = R_DS; + goto next_byte; + case 0x26: + s->override = R_ES; + goto next_byte; + case 0x64: + s->override = R_FS; + goto next_byte; + case 0x65: + s->override = R_GS; + goto next_byte; + case 0x66: + prefixes |= PREFIX_DATA; + goto next_byte; + case 0x67: + prefixes |= PREFIX_ADR; + goto next_byte; + } + if (prefixes & PREFIX_DATA) + dflag ^= 1; + if (prefixes & PREFIX_ADR) + aflag ^= 1; + } + + s->prefix = prefixes; + s->aflag = aflag; + s->dflag = dflag; + + /* lock generation */ +#ifndef VBOX + if (prefixes & PREFIX_LOCK) + gen_helper_lock(); +#else /* VBOX */ + if (prefixes & PREFIX_LOCK) { + if (is_invalid_lock_sequence(s, pc_start, b)) { + gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base); + return s->pc; + } + gen_helper_lock(); + } +#endif /* VBOX */ + + /* now check op code */ + reswitch: + switch(b) { + case 0x0f: + /**************************/ + /* extended op code */ + b = ldub_code(s->pc++) | 0x100; + goto reswitch; + + /**************************/ + /* arith & logic */ + case 0x00 ... 0x05: + case 0x08 ... 0x0d: + case 0x10 ... 0x15: + case 0x18 ... 0x1d: + case 0x20 ... 0x25: + case 0x28 ... 0x2d: + case 0x30 ... 0x35: + case 0x38 ... 
0x3d: + { + int op, f, val; + op = (b >> 3) & 7; + f = (b >> 1) & 3; + + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + + switch(f) { + case 0: /* OP Ev, Gv */ + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + rm = (modrm & 7) | REX_B(s); + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + opreg = OR_TMP0; + } else if (op == OP_XORL && rm == reg) { + xor_zero: + /* xor reg, reg optimisation */ + gen_op_movl_T0_0(); + s->cc_op = CC_OP_LOGICB + ot; + gen_op_mov_reg_T0(ot, reg); + gen_op_update1_cc(); + break; + } else { + opreg = rm; + } + gen_op_mov_TN_reg(ot, 1, reg); + gen_op(s, op, ot, opreg); + break; + case 1: /* OP Gv, Ev */ + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + reg = ((modrm >> 3) & 7) | rex_r; + rm = (modrm & 7) | REX_B(s); + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_op_ld_T1_A0(ot + s->mem_index); + } else if (op == OP_XORL && rm == reg) { + goto xor_zero; + } else { + gen_op_mov_TN_reg(ot, 1, rm); + } + gen_op(s, op, ot, reg); + break; + case 2: /* OP A, Iv */ + val = insn_get(s, ot); + gen_op_movl_T1_im(val); + gen_op(s, op, ot, OR_EAX); + break; + } + } + break; + + case 0x82: + if (CODE64(s)) + goto illegal_op; + case 0x80: /* GRP1 */ + case 0x81: + case 0x83: + { + int val; + + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + rm = (modrm & 7) | REX_B(s); + op = (modrm >> 3) & 7; + + if (mod != 3) { + if (b == 0x83) + s->rip_offset = 1; + else + s->rip_offset = insn_const_size(ot); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + opreg = OR_TMP0; + } else { + opreg = rm; + } + + switch(b) { + default: + case 0x80: + case 0x81: + case 0x82: + val = insn_get(s, ot); + break; + case 0x83: + val = (int8_t)insn_get(s, OT_BYTE); + break; + } + gen_op_movl_T1_im(val); + gen_op(s, op, ot, opreg); + } + break; + + /**************************/ + /* inc, dec, and other misc arith */ + case 0x40 ... 0x47: /* inc Gv */ + ot = dflag ? OT_LONG : OT_WORD; + gen_inc(s, ot, OR_EAX + (b & 7), 1); + break; + case 0x48 ... 0x4f: /* dec Gv */ + ot = dflag ? 
OT_LONG : OT_WORD; + gen_inc(s, ot, OR_EAX + (b & 7), -1); + break; + case 0xf6: /* GRP3 */ + case 0xf7: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + rm = (modrm & 7) | REX_B(s); + op = (modrm >> 3) & 7; + if (mod != 3) { + if (op == 0) + s->rip_offset = insn_const_size(ot); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_op_ld_T0_A0(ot + s->mem_index); + } else { + gen_op_mov_TN_reg(ot, 0, rm); + } + + switch(op) { + case 0: /* test */ + val = insn_get(s, ot); + gen_op_movl_T1_im(val); + gen_op_testl_T0_T1_cc(); + s->cc_op = CC_OP_LOGICB + ot; + break; + case 2: /* not */ + tcg_gen_not_tl(cpu_T[0], cpu_T[0]); + if (mod != 3) { + gen_op_st_T0_A0(ot + s->mem_index); + } else { + gen_op_mov_reg_T0(ot, rm); + } + break; + case 3: /* neg */ + tcg_gen_neg_tl(cpu_T[0], cpu_T[0]); + if (mod != 3) { + gen_op_st_T0_A0(ot + s->mem_index); + } else { + gen_op_mov_reg_T0(ot, rm); + } + gen_op_update_neg_cc(); + s->cc_op = CC_OP_SUBB + ot; + break; + case 4: /* mul */ + switch(ot) { + case OT_BYTE: + gen_op_mov_TN_reg(OT_BYTE, 1, R_EAX); + tcg_gen_ext8u_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext8u_tl(cpu_T[1], cpu_T[1]); + /* XXX: use 32 bit mul which could be faster */ + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + gen_op_mov_reg_T0(OT_WORD, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_andi_tl(cpu_cc_src, cpu_T[0], 0xff00); + s->cc_op = CC_OP_MULB; + break; + case OT_WORD: + gen_op_mov_TN_reg(OT_WORD, 1, R_EAX); + tcg_gen_ext16u_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext16u_tl(cpu_T[1], cpu_T[1]); + /* XXX: use 32 bit mul which could be faster */ + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + gen_op_mov_reg_T0(OT_WORD, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 16); + gen_op_mov_reg_T0(OT_WORD, R_EDX); + tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]); + s->cc_op = CC_OP_MULW; + break; + default: + case OT_LONG: +#ifdef TARGET_X86_64 + gen_op_mov_TN_reg(OT_LONG, 1, R_EAX); + tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext32u_tl(cpu_T[1], cpu_T[1]); + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + gen_op_mov_reg_T0(OT_LONG, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32); + gen_op_mov_reg_T0(OT_LONG, R_EDX); + tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]); +#else + { + TCGv_i64 t0, t1; + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + gen_op_mov_TN_reg(OT_LONG, 1, R_EAX); + tcg_gen_extu_i32_i64(t0, cpu_T[0]); + tcg_gen_extu_i32_i64(t1, cpu_T[1]); + tcg_gen_mul_i64(t0, t0, t1); + tcg_gen_trunc_i64_i32(cpu_T[0], t0); + gen_op_mov_reg_T0(OT_LONG, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_shri_i64(t0, t0, 32); + tcg_gen_trunc_i64_i32(cpu_T[0], t0); + gen_op_mov_reg_T0(OT_LONG, R_EDX); + tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]); + } +#endif + s->cc_op = CC_OP_MULL; + break; +#ifdef TARGET_X86_64 + case OT_QUAD: + gen_helper_mulq_EAX_T0(cpu_T[0]); + s->cc_op = CC_OP_MULQ; + break; +#endif + } + break; + case 5: /* imul */ + switch(ot) { + case OT_BYTE: + gen_op_mov_TN_reg(OT_BYTE, 1, R_EAX); + tcg_gen_ext8s_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext8s_tl(cpu_T[1], cpu_T[1]); + /* XXX: use 32 bit mul which could be faster */ + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + gen_op_mov_reg_T0(OT_WORD, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_ext8s_tl(cpu_tmp0, cpu_T[0]); + tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0); + s->cc_op = CC_OP_MULB; + break; + case OT_WORD: + gen_op_mov_TN_reg(OT_WORD, 1, R_EAX); + 
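+                    /* sign-extend both 16-bit operands so the host multiply yields the full signed product */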
tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]); + /* XXX: use 32 bit mul which could be faster */ + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + gen_op_mov_reg_T0(OT_WORD, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]); + tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 16); + gen_op_mov_reg_T0(OT_WORD, R_EDX); + s->cc_op = CC_OP_MULW; + break; + default: + case OT_LONG: +#ifdef TARGET_X86_64 + gen_op_mov_TN_reg(OT_LONG, 1, R_EAX); + tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]); + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + gen_op_mov_reg_T0(OT_LONG, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]); + tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0); + tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32); + gen_op_mov_reg_T0(OT_LONG, R_EDX); +#else + { + TCGv_i64 t0, t1; + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + gen_op_mov_TN_reg(OT_LONG, 1, R_EAX); + tcg_gen_ext_i32_i64(t0, cpu_T[0]); + tcg_gen_ext_i32_i64(t1, cpu_T[1]); + tcg_gen_mul_i64(t0, t0, t1); + tcg_gen_trunc_i64_i32(cpu_T[0], t0); + gen_op_mov_reg_T0(OT_LONG, R_EAX); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31); + tcg_gen_shri_i64(t0, t0, 32); + tcg_gen_trunc_i64_i32(cpu_T[0], t0); + gen_op_mov_reg_T0(OT_LONG, R_EDX); + tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0); + } +#endif + s->cc_op = CC_OP_MULL; + break; +#ifdef TARGET_X86_64 + case OT_QUAD: + gen_helper_imulq_EAX_T0(cpu_T[0]); + s->cc_op = CC_OP_MULQ; + break; +#endif + } + break; + case 6: /* div */ + switch(ot) { + case OT_BYTE: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_divb_AL(cpu_T[0]); + break; + case OT_WORD: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_divw_AX(cpu_T[0]); + break; + default: + case OT_LONG: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_divl_EAX(cpu_T[0]); + break; +#ifdef TARGET_X86_64 + case OT_QUAD: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_divq_EAX(cpu_T[0]); + break; +#endif + } + break; + case 7: /* idiv */ + switch(ot) { + case OT_BYTE: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_idivb_AL(cpu_T[0]); + break; + case OT_WORD: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_idivw_AX(cpu_T[0]); + break; + default: + case OT_LONG: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_idivl_EAX(cpu_T[0]); + break; +#ifdef TARGET_X86_64 + case OT_QUAD: + gen_jmp_im(pc_start - s->cs_base); + gen_helper_idivq_EAX(cpu_T[0]); + break; +#endif + } + break; + default: + goto illegal_op; + } + break; + + case 0xfe: /* GRP4 */ + case 0xff: /* GRP5 */ + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + rm = (modrm & 7) | REX_B(s); + op = (modrm >> 3) & 7; + if (op >= 2 && b == 0xfe) { + goto illegal_op; + } + if (CODE64(s)) { + if (op == 2 || op == 4) { + /* operand size for jumps is 64 bit */ + ot = OT_QUAD; + } else if (op == 3 || op == 5) { + ot = dflag ? OT_LONG + (rex_w == 1) : OT_WORD; + } else if (op == 6) { + /* default push size is 64 bit */ + ot = dflag ? 
OT_QUAD : OT_WORD; + } + } + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if (op >= 2 && op != 3 && op != 5) + gen_op_ld_T0_A0(ot + s->mem_index); + } else { + gen_op_mov_TN_reg(ot, 0, rm); + } + + switch(op) { + case 0: /* inc Ev */ + if (mod != 3) + opreg = OR_TMP0; + else + opreg = rm; + gen_inc(s, ot, opreg, 1); + break; + case 1: /* dec Ev */ + if (mod != 3) + opreg = OR_TMP0; + else + opreg = rm; + gen_inc(s, ot, opreg, -1); + break; + case 2: /* call Ev */ + /* XXX: optimize if memory (no 'and' is necessary) */ +#ifdef VBOX_WITH_CALL_RECORD + if (s->record_call) + gen_op_record_call(); +#endif + if (s->dflag == 0) + gen_op_andl_T0_ffff(); + next_eip = s->pc - s->cs_base; + gen_movtl_T1_im(next_eip); + gen_push_T1(s); + gen_op_jmp_T0(); + gen_eob(s); + break; + case 3: /* lcall Ev */ + gen_op_ld_T1_A0(ot + s->mem_index); + gen_add_A0_im(s, 1 << (ot - OT_WORD + 1)); + gen_op_ldu_T0_A0(OT_WORD + s->mem_index); + do_lcall: + if (s->pe && !s->vm86) { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_lcall_protected(cpu_tmp2_i32, cpu_T[1], + tcg_const_i32(dflag), + tcg_const_i32(s->pc - pc_start)); + } else { + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_lcall_real(cpu_tmp2_i32, cpu_T[1], + tcg_const_i32(dflag), + tcg_const_i32(s->pc - s->cs_base)); + } + gen_eob(s); + break; + case 4: /* jmp Ev */ + if (s->dflag == 0) + gen_op_andl_T0_ffff(); + gen_op_jmp_T0(); + gen_eob(s); + break; + case 5: /* ljmp Ev */ + gen_op_ld_T1_A0(ot + s->mem_index); + gen_add_A0_im(s, 1 << (ot - OT_WORD + 1)); + gen_op_ldu_T0_A0(OT_WORD + s->mem_index); + do_ljmp: + if (s->pe && !s->vm86) { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_ljmp_protected(cpu_tmp2_i32, cpu_T[1], + tcg_const_i32(s->pc - pc_start)); + } else { + gen_op_movl_seg_T0_vm(R_CS); + gen_op_movl_T0_T1(); + gen_op_jmp_T0(); + } + gen_eob(s); + break; + case 6: /* push Ev */ + gen_push_T0(s); + break; + default: + goto illegal_op; + } + break; + + case 0x84: /* test Ev, Gv */ + case 0x85: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + gen_op_mov_TN_reg(ot, 1, reg); + gen_op_testl_T0_T1_cc(); + s->cc_op = CC_OP_LOGICB + ot; + break; + + case 0xa8: /* test eAX, Iv */ + case 0xa9: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + val = insn_get(s, ot); + + gen_op_mov_TN_reg(ot, 0, OR_EAX); + gen_op_movl_T1_im(val); + gen_op_testl_T0_T1_cc(); + s->cc_op = CC_OP_LOGICB + ot; + break; + + case 0x98: /* CWDE/CBW */ +#ifdef TARGET_X86_64 + if (dflag == 2) { + gen_op_mov_TN_reg(OT_LONG, 0, R_EAX); + tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]); + gen_op_mov_reg_T0(OT_QUAD, R_EAX); + } else +#endif + if (dflag == 1) { + gen_op_mov_TN_reg(OT_WORD, 0, R_EAX); + tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]); + gen_op_mov_reg_T0(OT_LONG, R_EAX); + } else { + gen_op_mov_TN_reg(OT_BYTE, 0, R_EAX); + tcg_gen_ext8s_tl(cpu_T[0], cpu_T[0]); + gen_op_mov_reg_T0(OT_WORD, R_EAX); + } + break; + case 0x99: /* CDQ/CWD */ +#ifdef TARGET_X86_64 + if (dflag == 2) { + gen_op_mov_TN_reg(OT_QUAD, 0, R_EAX); + tcg_gen_sari_tl(cpu_T[0], cpu_T[0], 63); + gen_op_mov_reg_T0(OT_QUAD, R_EDX); + } else +#endif + if (dflag == 1) { + gen_op_mov_TN_reg(OT_LONG, 0, R_EAX); + 
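+            /* CDQ: propagate the sign bit of EAX into every bit of EDX */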
tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]); + tcg_gen_sari_tl(cpu_T[0], cpu_T[0], 31); + gen_op_mov_reg_T0(OT_LONG, R_EDX); + } else { + gen_op_mov_TN_reg(OT_WORD, 0, R_EAX); + tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]); + tcg_gen_sari_tl(cpu_T[0], cpu_T[0], 15); + gen_op_mov_reg_T0(OT_WORD, R_EDX); + } + break; + case 0x1af: /* imul Gv, Ev */ + case 0x69: /* imul Gv, Ev, I */ + case 0x6b: + ot = dflag + OT_WORD; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + if (b == 0x69) + s->rip_offset = insn_const_size(ot); + else if (b == 0x6b) + s->rip_offset = 1; + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + if (b == 0x69) { + val = insn_get(s, ot); + gen_op_movl_T1_im(val); + } else if (b == 0x6b) { + val = (int8_t)insn_get(s, OT_BYTE); + gen_op_movl_T1_im(val); + } else { + gen_op_mov_TN_reg(ot, 1, reg); + } + +#ifdef TARGET_X86_64 + if (ot == OT_QUAD) { + gen_helper_imulq_T0_T1(cpu_T[0], cpu_T[0], cpu_T[1]); + } else +#endif + if (ot == OT_LONG) { +#ifdef TARGET_X86_64 + tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]); + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]); + tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0); +#else + { + TCGv_i64 t0, t1; + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(t0, cpu_T[0]); + tcg_gen_ext_i32_i64(t1, cpu_T[1]); + tcg_gen_mul_i64(t0, t0, t1); + tcg_gen_trunc_i64_i32(cpu_T[0], t0); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31); + tcg_gen_shri_i64(t0, t0, 32); + tcg_gen_trunc_i64_i32(cpu_T[1], t0); + tcg_gen_sub_tl(cpu_cc_src, cpu_T[1], cpu_tmp0); + } +#endif + } else { + tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]); + tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]); + /* XXX: use 32 bit mul which could be faster */ + tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]); + tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0); + } + gen_op_mov_reg_T0(ot, reg); + s->cc_op = CC_OP_MULB + ot; + break; + case 0x1c0: + case 0x1c1: /* xadd Ev, Gv */ + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + if (mod == 3) { + rm = (modrm & 7) | REX_B(s); + gen_op_mov_TN_reg(ot, 0, reg); + gen_op_mov_TN_reg(ot, 1, rm); + gen_op_addl_T0_T1(); + gen_op_mov_reg_T1(ot, reg); + gen_op_mov_reg_T0(ot, rm); + } else { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_op_mov_TN_reg(ot, 0, reg); + gen_op_ld_T1_A0(ot + s->mem_index); + gen_op_addl_T0_T1(); + gen_op_st_T0_A0(ot + s->mem_index); + gen_op_mov_reg_T1(ot, reg); + } + gen_op_update2_cc(); + s->cc_op = CC_OP_ADDB + ot; + break; + case 0x1b0: + case 0x1b1: /* cmpxchg Ev, Gv */ + { + int label1, label2; + TCGv t0, t1, t2, a0; + + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + t0 = tcg_temp_local_new(); + t1 = tcg_temp_local_new(); + t2 = tcg_temp_local_new(); + a0 = tcg_temp_local_new(); + gen_op_mov_v_reg(ot, t1, reg); + if (mod == 3) { + rm = (modrm & 7) | REX_B(s); + gen_op_mov_v_reg(ot, t0, rm); + } else { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + tcg_gen_mov_tl(a0, cpu_A0); + gen_op_ld_v(ot + s->mem_index, t0, a0); + rm = 0; /* avoid warning */ + } + label1 = gen_new_label(); + tcg_gen_sub_tl(t2, cpu_regs[R_EAX], t0); + gen_extu(ot, t2); + 
tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1); + if (mod == 3) { + label2 = gen_new_label(); + gen_op_mov_reg_v(ot, R_EAX, t0); + tcg_gen_br(label2); + gen_set_label(label1); + gen_op_mov_reg_v(ot, rm, t1); + gen_set_label(label2); + } else { + tcg_gen_mov_tl(t1, t0); + gen_op_mov_reg_v(ot, R_EAX, t0); + gen_set_label(label1); + /* always store */ + gen_op_st_v(ot + s->mem_index, t1, a0); + } + tcg_gen_mov_tl(cpu_cc_src, t0); + tcg_gen_mov_tl(cpu_cc_dst, t2); + s->cc_op = CC_OP_SUBB + ot; + tcg_temp_free(t0); + tcg_temp_free(t1); + tcg_temp_free(t2); + tcg_temp_free(a0); + } + break; + case 0x1c7: /* cmpxchg8b */ + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + if ((mod == 3) || ((modrm & 0x38) != 0x8)) + goto illegal_op; +#ifdef TARGET_X86_64 + if (dflag == 2) { + if (!(s->cpuid_ext_features & CPUID_EXT_CX16)) + goto illegal_op; + gen_jmp_im(pc_start - s->cs_base); + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_helper_cmpxchg16b(cpu_A0); + } else +#endif + { + if (!(s->cpuid_features & CPUID_CX8)) + goto illegal_op; + gen_jmp_im(pc_start - s->cs_base); + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_helper_cmpxchg8b(cpu_A0); + } + s->cc_op = CC_OP_EFLAGS; + break; + + /**************************/ + /* push/pop */ + case 0x50 ... 0x57: /* push */ + gen_op_mov_TN_reg(OT_LONG, 0, (b & 7) | REX_B(s)); + gen_push_T0(s); + break; + case 0x58 ... 0x5f: /* pop */ + if (CODE64(s)) { + ot = dflag ? OT_QUAD : OT_WORD; + } else { + ot = dflag + OT_WORD; + } + gen_pop_T0(s); + /* NOTE: order is important for pop %sp */ + gen_pop_update(s); + gen_op_mov_reg_T0(ot, (b & 7) | REX_B(s)); + break; + case 0x60: /* pusha */ + if (CODE64(s)) + goto illegal_op; + gen_pusha(s); + break; + case 0x61: /* popa */ + if (CODE64(s)) + goto illegal_op; + gen_popa(s); + break; + case 0x68: /* push Iv */ + case 0x6a: + if (CODE64(s)) { + ot = dflag ? OT_QUAD : OT_WORD; + } else { + ot = dflag + OT_WORD; + } + if (b == 0x68) + val = insn_get(s, ot); + else + val = (int8_t)insn_get(s, OT_BYTE); + gen_op_movl_T0_im(val); + gen_push_T0(s); + break; + case 0x8f: /* pop Ev */ + if (CODE64(s)) { + ot = dflag ? OT_QUAD : OT_WORD; + } else { + ot = dflag + OT_WORD; + } + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + gen_pop_T0(s); + if (mod == 3) { + /* NOTE: order is important for pop %sp */ + gen_pop_update(s); + rm = (modrm & 7) | REX_B(s); + gen_op_mov_reg_T0(ot, rm); + } else { + /* NOTE: order is important too for MMU exceptions */ + s->popl_esp_hack = 1 << ot; + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 1); + s->popl_esp_hack = 0; + gen_pop_update(s); + } + break; + case 0xc8: /* enter */ + { + int level; + val = lduw_code(s->pc); + s->pc += 2; + level = ldub_code(s->pc++); + gen_enter(s, val, level); + } + break; + case 0xc9: /* leave */ + /* XXX: exception not precise (ESP is updated before potential exception) */ + if (CODE64(s)) { + gen_op_mov_TN_reg(OT_QUAD, 0, R_EBP); + gen_op_mov_reg_T0(OT_QUAD, R_ESP); + } else if (s->ss32) { + gen_op_mov_TN_reg(OT_LONG, 0, R_EBP); + gen_op_mov_reg_T0(OT_LONG, R_ESP); + } else { + gen_op_mov_TN_reg(OT_WORD, 0, R_EBP); + gen_op_mov_reg_T0(OT_WORD, R_ESP); + } + gen_pop_T0(s); + if (CODE64(s)) { + ot = dflag ? 
OT_QUAD : OT_WORD; + } else { + ot = dflag + OT_WORD; + } + gen_op_mov_reg_T0(ot, R_EBP); + gen_pop_update(s); + break; + case 0x06: /* push es */ + case 0x0e: /* push cs */ + case 0x16: /* push ss */ + case 0x1e: /* push ds */ + if (CODE64(s)) + goto illegal_op; + gen_op_movl_T0_seg(b >> 3); + gen_push_T0(s); + break; + case 0x1a0: /* push fs */ + case 0x1a8: /* push gs */ + gen_op_movl_T0_seg((b >> 3) & 7); + gen_push_T0(s); + break; + case 0x07: /* pop es */ + case 0x17: /* pop ss */ + case 0x1f: /* pop ds */ + if (CODE64(s)) + goto illegal_op; + reg = b >> 3; + gen_pop_T0(s); + gen_movl_seg_T0(s, reg, pc_start - s->cs_base); + gen_pop_update(s); + if (reg == R_SS) { + /* if reg == SS, inhibit interrupts/trace. */ + /* If several instructions disable interrupts, only the + _first_ does it */ + if (!(s->tb->flags & HF_INHIBIT_IRQ_MASK)) + gen_helper_set_inhibit_irq(); + s->tf = 0; + } + if (s->is_jmp) { + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } + break; + case 0x1a1: /* pop fs */ + case 0x1a9: /* pop gs */ + gen_pop_T0(s); + gen_movl_seg_T0(s, (b >> 3) & 7, pc_start - s->cs_base); + gen_pop_update(s); + if (s->is_jmp) { + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } + break; + + /**************************/ + /* mov */ + case 0x88: + case 0x89: /* mov Gv, Ev */ + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + + /* generate a generic store */ + gen_ldst_modrm(s, modrm, ot, reg, 1); + break; + case 0xc6: + case 0xc7: /* mov Ev, Iv */ + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + if (mod != 3) { + s->rip_offset = insn_const_size(ot); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + } + val = insn_get(s, ot); + gen_op_movl_T0_im(val); + if (mod != 3) + gen_op_st_T0_A0(ot + s->mem_index); + else + gen_op_mov_reg_T0(ot, (modrm & 7) | REX_B(s)); + break; + case 0x8a: + case 0x8b: /* mov Ev, Gv */ +#ifdef VBOX /* dtrace hot fix */ + if (prefixes & PREFIX_LOCK) + goto illegal_op; +#endif + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = OT_WORD + dflag; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + gen_op_mov_reg_T0(ot, reg); + break; + case 0x8e: /* mov seg, Gv */ + modrm = ldub_code(s->pc++); + reg = (modrm >> 3) & 7; + if (reg >= 6 || reg == R_CS) + goto illegal_op; + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0); + gen_movl_seg_T0(s, reg, pc_start - s->cs_base); + if (reg == R_SS) { + /* if reg == SS, inhibit interrupts/trace */ + /* If several instructions disable interrupts, only the + _first_ does it */ + if (!(s->tb->flags & HF_INHIBIT_IRQ_MASK)) + gen_helper_set_inhibit_irq(); + s->tf = 0; + } + if (s->is_jmp) { + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } + break; + case 0x8c: /* mov Gv, seg */ + modrm = ldub_code(s->pc++); + reg = (modrm >> 3) & 7; + mod = (modrm >> 6) & 3; + if (reg >= 6) + goto illegal_op; + gen_op_movl_T0_seg(reg); + if (mod == 3) + ot = OT_WORD + dflag; + else + ot = OT_WORD; + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 1); + break; + + case 0x1b6: /* movzbS Gv, Eb */ + case 0x1b7: /* movzwS Gv, Eb */ + case 0x1be: /* movsbS Gv, Eb */ + case 0x1bf: /* movswS Gv, Eb */ + { + int d_ot; + /* d_ot is the size of destination */ + d_ot = dflag + OT_WORD; + /* ot is the size of source */ + ot = (b & 1) + OT_BYTE; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + rm = (modrm & 7) | 
REX_B(s); + + if (mod == 3) { + gen_op_mov_TN_reg(ot, 0, rm); + switch(ot | (b & 8)) { + case OT_BYTE: + tcg_gen_ext8u_tl(cpu_T[0], cpu_T[0]); + break; + case OT_BYTE | 8: + tcg_gen_ext8s_tl(cpu_T[0], cpu_T[0]); + break; + case OT_WORD: + tcg_gen_ext16u_tl(cpu_T[0], cpu_T[0]); + break; + default: + case OT_WORD | 8: + tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]); + break; + } + gen_op_mov_reg_T0(d_ot, reg); + } else { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if (b & 8) { + gen_op_lds_T0_A0(ot + s->mem_index); + } else { + gen_op_ldu_T0_A0(ot + s->mem_index); + } + gen_op_mov_reg_T0(d_ot, reg); + } + } + break; + + case 0x8d: /* lea */ + ot = dflag + OT_WORD; + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + if (mod == 3) + goto illegal_op; + reg = ((modrm >> 3) & 7) | rex_r; + /* we must ensure that no segment is added */ + s->override = -1; + val = s->addseg; + s->addseg = 0; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + s->addseg = val; + gen_op_mov_reg_A0(ot - OT_WORD, reg); + break; + + case 0xa0: /* mov EAX, Ov */ + case 0xa1: + case 0xa2: /* mov Ov, EAX */ + case 0xa3: + { + target_ulong offset_addr; + + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + offset_addr = ldq_code(s->pc); + s->pc += 8; + gen_op_movq_A0_im(offset_addr); + } else +#endif + { + if (s->aflag) { + offset_addr = insn_get(s, OT_LONG); + } else { + offset_addr = insn_get(s, OT_WORD); + } + gen_op_movl_A0_im(offset_addr); + } + gen_add_A0_ds_seg(s); + if ((b & 2) == 0) { + gen_op_ld_T0_A0(ot + s->mem_index); + gen_op_mov_reg_T0(ot, R_EAX); + } else { + gen_op_mov_TN_reg(ot, 0, R_EAX); + gen_op_st_T0_A0(ot + s->mem_index); + } + } + break; + case 0xd7: /* xlat */ +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + gen_op_movq_A0_reg(R_EBX); + gen_op_mov_TN_reg(OT_QUAD, 0, R_EAX); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 0xff); + tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_T[0]); + } else +#endif + { + gen_op_movl_A0_reg(R_EBX); + gen_op_mov_TN_reg(OT_LONG, 0, R_EAX); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 0xff); + tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_T[0]); + if (s->aflag == 0) + gen_op_andl_A0_ffff(); + else + tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff); + } + gen_add_A0_ds_seg(s); + gen_op_ldu_T0_A0(OT_BYTE + s->mem_index); + gen_op_mov_reg_T0(OT_BYTE, R_EAX); + break; + case 0xb0 ... 0xb7: /* mov R, Ib */ + val = insn_get(s, OT_BYTE); + gen_op_movl_T0_im(val); + gen_op_mov_reg_T0(OT_BYTE, (b & 7) | REX_B(s)); + break; + case 0xb8 ... 0xbf: /* mov R, Iv */ +#ifdef TARGET_X86_64 + if (dflag == 2) { + uint64_t tmp; + /* 64 bit case */ + tmp = ldq_code(s->pc); + s->pc += 8; + reg = (b & 7) | REX_B(s); + gen_movtl_T0_im(tmp); + gen_op_mov_reg_T0(OT_QUAD, reg); + } else +#endif + { + ot = dflag ? OT_LONG : OT_WORD; + val = insn_get(s, ot); + reg = (b & 7) | REX_B(s); + gen_op_movl_T0_im(val); + gen_op_mov_reg_T0(ot, reg); + } + break; + + case 0x91 ... 
0x97: /* xchg R, EAX */
+    do_xchg_reg_eax:
+        ot = dflag + OT_WORD;
+        reg = (b & 7) | REX_B(s);
+        rm = R_EAX;
+        goto do_xchg_reg;
+    case 0x86:
+    case 0x87: /* xchg Ev, Gv */
+        if ((b & 1) == 0)
+            ot = OT_BYTE;
+        else
+            ot = dflag + OT_WORD;
+        modrm = ldub_code(s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        mod = (modrm >> 6) & 3;
+        if (mod == 3) {
+            rm = (modrm & 7) | REX_B(s);
+        do_xchg_reg:
+            gen_op_mov_TN_reg(ot, 0, reg);
+            gen_op_mov_TN_reg(ot, 1, rm);
+            gen_op_mov_reg_T0(ot, rm);
+            gen_op_mov_reg_T1(ot, reg);
+        } else {
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            gen_op_mov_TN_reg(ot, 0, reg);
+            /* for xchg, lock is implicit */
+            if (!(prefixes & PREFIX_LOCK))
+                gen_helper_lock();
+            gen_op_ld_T1_A0(ot + s->mem_index);
+            gen_op_st_T0_A0(ot + s->mem_index);
+            if (!(prefixes & PREFIX_LOCK))
+                gen_helper_unlock();
+            gen_op_mov_reg_T1(ot, reg);
+        }
+        break;
+    case 0xc4: /* les Gv */
+        if (CODE64(s))
+            goto illegal_op;
+        op = R_ES;
+        goto do_lxx;
+    case 0xc5: /* lds Gv */
+        if (CODE64(s))
+            goto illegal_op;
+        op = R_DS;
+        goto do_lxx;
+    case 0x1b2: /* lss Gv */
+        op = R_SS;
+        goto do_lxx;
+    case 0x1b4: /* lfs Gv */
+        op = R_FS;
+        goto do_lxx;
+    case 0x1b5: /* lgs Gv */
+        op = R_GS;
+    do_lxx:
+        ot = dflag ? OT_LONG : OT_WORD;
+        modrm = ldub_code(s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        mod = (modrm >> 6) & 3;
+        if (mod == 3)
+            goto illegal_op;
+        gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+        gen_op_ld_T1_A0(ot + s->mem_index);
+        gen_add_A0_im(s, 1 << (ot - OT_WORD + 1));
+        /* load the segment first to handle exceptions properly */
+        gen_op_ldu_T0_A0(OT_WORD + s->mem_index);
+        gen_movl_seg_T0(s, op, pc_start - s->cs_base);
+        /* then put the data */
+        gen_op_mov_reg_T1(ot, reg);
+        if (s->is_jmp) {
+            gen_jmp_im(s->pc - s->cs_base);
+            gen_eob(s);
+        }
+        break;
+
+        /************************/
+        /* shifts */
+    case 0xc0:
+    case 0xc1:
+        /* shift Ev,Ib */
+        shift = 2;
+    grp2:
+        {
+            if ((b & 1) == 0)
+                ot = OT_BYTE;
+            else
+                ot = dflag + OT_WORD;
+
+            modrm = ldub_code(s->pc++);
+            mod = (modrm >> 6) & 3;
+            op = (modrm >> 3) & 7;
+
+            if (mod != 3) {
+                if (shift == 2) {
+                    s->rip_offset = 1;
+                }
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                opreg = OR_TMP0;
+            } else {
+                opreg = (modrm & 7) | REX_B(s);
+            }
+
+            /* simpler op */
+            if (shift == 0) {
+                gen_shift(s, op, ot, opreg, OR_ECX);
+            } else {
+                if (shift == 2) {
+                    shift = ldub_code(s->pc++);
+                }
+                gen_shifti(s, op, ot, opreg, shift);
+            }
+        }
+        break;
+    case 0xd0:
+    case 0xd1:
+        /* shift Ev,1 */
+        shift = 1;
+        goto grp2;
+    case 0xd2:
+    case 0xd3:
+        /* shift Ev,cl */
+        shift = 0;
+        goto grp2;
+
+    case 0x1a4: /* shld imm */
+        op = 0;
+        shift = 1;
+        goto do_shiftd;
+    case 0x1a5: /* shld cl */
+        op = 0;
+        shift = 0;
+        goto do_shiftd;
+    case 0x1ac: /* shrd imm */
+        op = 1;
+        shift = 1;
+        goto do_shiftd;
+    case 0x1ad: /* shrd cl */
+        op = 1;
+        shift = 0;
+    do_shiftd:
+        ot = dflag + OT_WORD;
+        modrm = ldub_code(s->pc++);
+        mod = (modrm >> 6) & 3;
+        rm = (modrm & 7) | REX_B(s);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        if (mod != 3) {
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            opreg = OR_TMP0;
+        } else {
+            opreg = rm;
+        }
+        gen_op_mov_TN_reg(ot, 1, reg);
+
+        if (shift) {
+            val = ldub_code(s->pc++);
+            tcg_gen_movi_tl(cpu_T3, val);
+        } else {
+            tcg_gen_mov_tl(cpu_T3, cpu_regs[R_ECX]);
+        }
+        gen_shiftd_rm_T1_T3(s, ot, opreg, op);
+        break;
+
+        /************************/
+        /* floats */
+    case 0xd8 ... 0xdf:
+        if (s->flags & (HF_EM_MASK | HF_TS_MASK)) {
+            /* if CR0.EM or CR0.TS are set, generate an FPU exception */
+            /* XXX: what to do if illegal op ? */
+            gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
+            break;
+        }
+        modrm = ldub_code(s->pc++);
+        mod = (modrm >> 6) & 3;
+        rm = modrm & 7;
+        op = ((b & 7) << 3) | ((modrm >> 3) & 7);
+        if (mod != 3) {
+            /* memory op */
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            switch(op) {
+            case 0x00 ... 0x07: /* fxxxs */
+            case 0x10 ... 0x17: /* fixxxl */
+            case 0x20 ... 0x27: /* fxxxl */
+            case 0x30 ... 0x37: /* fixxx */
+                {
+                    int op1;
+                    op1 = op & 7;
+
+                    switch(op >> 4) {
+                    case 0:
+                        gen_op_ld_T0_A0(OT_LONG + s->mem_index);
+                        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                        gen_helper_flds_FT0(cpu_tmp2_i32);
+                        break;
+                    case 1:
+                        gen_op_ld_T0_A0(OT_LONG + s->mem_index);
+                        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                        gen_helper_fildl_FT0(cpu_tmp2_i32);
+                        break;
+                    case 2:
+                        tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0,
+                                          (s->mem_index >> 2) - 1);
+                        gen_helper_fldl_FT0(cpu_tmp1_i64);
+                        break;
+                    case 3:
+                    default:
+                        gen_op_lds_T0_A0(OT_WORD + s->mem_index);
+                        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                        gen_helper_fildl_FT0(cpu_tmp2_i32);
+                        break;
+                    }
+
+                    gen_helper_fp_arith_ST0_FT0(op1);
+                    if (op1 == 3) {
+                        /* fcomp needs pop */
+                        gen_helper_fpop();
+                    }
+                }
+                break;
+            case 0x08: /* flds */
+            case 0x0a: /* fsts */
+            case 0x0b: /* fstps */
+            case 0x18 ... 0x1b: /* fildl, fisttpl, fistl, fistpl */
+            case 0x28 ... 0x2b: /* fldl, fisttpll, fstl, fstpl */
+            case 0x38 ... 0x3b: /* filds, fisttps, fists, fistps */
+                switch(op & 7) {
+                case 0:
+                    switch(op >> 4) {
+                    case 0:
+                        gen_op_ld_T0_A0(OT_LONG + s->mem_index);
+                        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                        gen_helper_flds_ST0(cpu_tmp2_i32);
+                        break;
+                    case 1:
+                        gen_op_ld_T0_A0(OT_LONG + s->mem_index);
+                        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                        gen_helper_fildl_ST0(cpu_tmp2_i32);
+                        break;
+                    case 2:
+                        tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0,
+                                          (s->mem_index >> 2) - 1);
+                        gen_helper_fldl_ST0(cpu_tmp1_i64);
+                        break;
+                    case 3:
+                    default:
+                        gen_op_lds_T0_A0(OT_WORD + s->mem_index);
+                        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                        gen_helper_fildl_ST0(cpu_tmp2_i32);
+                        break;
+                    }
+                    break;
+                case 1:
+                    /* XXX: the corresponding CPUID bit must be tested !
*/ + switch(op >> 4) { + case 1: + gen_helper_fisttl_ST0(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(OT_LONG + s->mem_index); + break; + case 2: + gen_helper_fisttll_ST0(cpu_tmp1_i64); + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 3: + default: + gen_helper_fistt_ST0(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + break; + } + gen_helper_fpop(); + break; + default: + switch(op >> 4) { + case 0: + gen_helper_fsts_ST0(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(OT_LONG + s->mem_index); + break; + case 1: + gen_helper_fistl_ST0(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(OT_LONG + s->mem_index); + break; + case 2: + gen_helper_fstl_ST0(cpu_tmp1_i64); + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 3: + default: + gen_helper_fist_ST0(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + break; + } + if ((op & 7) == 3) + gen_helper_fpop(); + break; + } + break; + case 0x0c: /* fldenv mem */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fldenv( + cpu_A0, tcg_const_i32(s->dflag)); + break; + case 0x0d: /* fldcw mem */ + gen_op_ld_T0_A0(OT_WORD + s->mem_index); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_fldcw(cpu_tmp2_i32); + break; + case 0x0e: /* fnstenv mem */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fstenv(cpu_A0, tcg_const_i32(s->dflag)); + break; + case 0x0f: /* fnstcw mem */ + gen_helper_fnstcw(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + break; + case 0x1d: /* fldt mem */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fldt_ST0(cpu_A0); + break; + case 0x1f: /* fstpt mem */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fstt_ST0(cpu_A0); + gen_helper_fpop(); + break; + case 0x2c: /* frstor mem */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_frstor(cpu_A0, tcg_const_i32(s->dflag)); + break; + case 0x2e: /* fnsave mem */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fsave(cpu_A0, tcg_const_i32(s->dflag)); + break; + case 0x2f: /* fnstsw mem */ + gen_helper_fnstsw(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + break; + case 0x3c: /* fbld */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fbld_ST0(cpu_A0); + break; + case 0x3e: /* fbstp */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fbst_ST0(cpu_A0); + gen_helper_fpop(); + break; + case 0x3d: /* fildll */ + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); + gen_helper_fildll_ST0(cpu_tmp1_i64); + break; + case 0x3f: /* fistpll */ + gen_helper_fistll_ST0(cpu_tmp1_i64); + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); + gen_helper_fpop(); + break; + default: + goto illegal_op; + } + } else { + /* register float ops */ + opreg = rm; + + switch(op) { 
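+                /* Descriptive note (not in upstream): op packs the low three
+                   bits of the opcode byte (0xd8..0xdf) into bits 5..3 and
+                   ModRM.reg into bits 2..0, per the 'op = ((b & 7) << 3) | ...'
+                   computation above, so each case below selects one
+                   register-operand x87 instruction group. */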
+ case 0x08: /* fld sti */ + gen_helper_fpush(); + gen_helper_fmov_ST0_STN(tcg_const_i32((opreg + 1) & 7)); + break; + case 0x09: /* fxchg sti */ + case 0x29: /* fxchg4 sti, undocumented op */ + case 0x39: /* fxchg7 sti, undocumented op */ + gen_helper_fxchg_ST0_STN(tcg_const_i32(opreg)); + break; + case 0x0a: /* grp d9/2 */ + switch(rm) { + case 0: /* fnop */ + /* check exceptions (FreeBSD FPU probe) */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fwait(); + break; + default: + goto illegal_op; + } + break; + case 0x0c: /* grp d9/4 */ + switch(rm) { + case 0: /* fchs */ + gen_helper_fchs_ST0(); + break; + case 1: /* fabs */ + gen_helper_fabs_ST0(); + break; + case 4: /* ftst */ + gen_helper_fldz_FT0(); + gen_helper_fcom_ST0_FT0(); + break; + case 5: /* fxam */ + gen_helper_fxam_ST0(); + break; + default: + goto illegal_op; + } + break; + case 0x0d: /* grp d9/5 */ + { + switch(rm) { + case 0: + gen_helper_fpush(); + gen_helper_fld1_ST0(); + break; + case 1: + gen_helper_fpush(); + gen_helper_fldl2t_ST0(); + break; + case 2: + gen_helper_fpush(); + gen_helper_fldl2e_ST0(); + break; + case 3: + gen_helper_fpush(); + gen_helper_fldpi_ST0(); + break; + case 4: + gen_helper_fpush(); + gen_helper_fldlg2_ST0(); + break; + case 5: + gen_helper_fpush(); + gen_helper_fldln2_ST0(); + break; + case 6: + gen_helper_fpush(); + gen_helper_fldz_ST0(); + break; + default: + goto illegal_op; + } + } + break; + case 0x0e: /* grp d9/6 */ + switch(rm) { + case 0: /* f2xm1 */ + gen_helper_f2xm1(); + break; + case 1: /* fyl2x */ + gen_helper_fyl2x(); + break; + case 2: /* fptan */ + gen_helper_fptan(); + break; + case 3: /* fpatan */ + gen_helper_fpatan(); + break; + case 4: /* fxtract */ + gen_helper_fxtract(); + break; + case 5: /* fprem1 */ + gen_helper_fprem1(); + break; + case 6: /* fdecstp */ + gen_helper_fdecstp(); + break; + default: + case 7: /* fincstp */ + gen_helper_fincstp(); + break; + } + break; + case 0x0f: /* grp d9/7 */ + switch(rm) { + case 0: /* fprem */ + gen_helper_fprem(); + break; + case 1: /* fyl2xp1 */ + gen_helper_fyl2xp1(); + break; + case 2: /* fsqrt */ + gen_helper_fsqrt(); + break; + case 3: /* fsincos */ + gen_helper_fsincos(); + break; + case 5: /* fscale */ + gen_helper_fscale(); + break; + case 4: /* frndint */ + gen_helper_frndint(); + break; + case 6: /* fsin */ + gen_helper_fsin(); + break; + default: + case 7: /* fcos */ + gen_helper_fcos(); + break; + } + break; + case 0x00: case 0x01: case 0x04 ... 0x07: /* fxxx st, sti */ + case 0x20: case 0x21: case 0x24 ... 0x27: /* fxxx sti, st */ + case 0x30: case 0x31: case 0x34 ... 
0x37: /* fxxxp sti, st */ + { + int op1; + + op1 = op & 7; + if (op >= 0x20) { + gen_helper_fp_arith_STN_ST0(op1, opreg); + if (op >= 0x30) + gen_helper_fpop(); + } else { + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fp_arith_ST0_FT0(op1); + } + } + break; + case 0x02: /* fcom */ + case 0x22: /* fcom2, undocumented op */ + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fcom_ST0_FT0(); + break; + case 0x03: /* fcomp */ + case 0x23: /* fcomp3, undocumented op */ + case 0x32: /* fcomp5, undocumented op */ + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fcom_ST0_FT0(); + gen_helper_fpop(); + break; + case 0x15: /* da/5 */ + switch(rm) { + case 1: /* fucompp */ + gen_helper_fmov_FT0_STN(tcg_const_i32(1)); + gen_helper_fucom_ST0_FT0(); + gen_helper_fpop(); + gen_helper_fpop(); + break; + default: + goto illegal_op; + } + break; + case 0x1c: + switch(rm) { + case 0: /* feni (287 only, just do nop here) */ + break; + case 1: /* fdisi (287 only, just do nop here) */ + break; + case 2: /* fclex */ + gen_helper_fclex(); + break; + case 3: /* fninit */ + gen_helper_fninit(); + break; + case 4: /* fsetpm (287 only, just do nop here) */ + break; + default: + goto illegal_op; + } + break; + case 0x1d: /* fucomi */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fucomi_ST0_FT0(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x1e: /* fcomi */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fcomi_ST0_FT0(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x28: /* ffree sti */ + gen_helper_ffree_STN(tcg_const_i32(opreg)); + break; + case 0x2a: /* fst sti */ + gen_helper_fmov_STN_ST0(tcg_const_i32(opreg)); + break; + case 0x2b: /* fstp sti */ + case 0x0b: /* fstp1 sti, undocumented op */ + case 0x3a: /* fstp8 sti, undocumented op */ + case 0x3b: /* fstp9 sti, undocumented op */ + gen_helper_fmov_STN_ST0(tcg_const_i32(opreg)); + gen_helper_fpop(); + break; + case 0x2c: /* fucom st(i) */ + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fucom_ST0_FT0(); + break; + case 0x2d: /* fucomp st(i) */ + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fucom_ST0_FT0(); + gen_helper_fpop(); + break; + case 0x33: /* de/3 */ + switch(rm) { + case 1: /* fcompp */ + gen_helper_fmov_FT0_STN(tcg_const_i32(1)); + gen_helper_fcom_ST0_FT0(); + gen_helper_fpop(); + gen_helper_fpop(); + break; + default: + goto illegal_op; + } + break; + case 0x38: /* ffreep sti, undocumented op */ + gen_helper_ffree_STN(tcg_const_i32(opreg)); + gen_helper_fpop(); + break; + case 0x3c: /* df/4 */ + switch(rm) { + case 0: + gen_helper_fnstsw(cpu_tmp2_i32); + tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); + gen_op_mov_reg_T0(OT_WORD, R_EAX); + break; + default: + goto illegal_op; + } + break; + case 0x3d: /* fucomip */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fucomi_ST0_FT0(); + gen_helper_fpop(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x3e: /* fcomip */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_fmov_FT0_STN(tcg_const_i32(opreg)); + gen_helper_fcomi_ST0_FT0(); + gen_helper_fpop(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x10 ... 0x13: /* fcmovxx */ + case 0x18 ... 
0x1b: + { + int op1, l1; + static const uint8_t fcmov_cc[8] = { + (JCC_B << 1), + (JCC_Z << 1), + (JCC_BE << 1), + (JCC_P << 1), + }; + op1 = fcmov_cc[op & 3] | (((op >> 3) & 1) ^ 1); + l1 = gen_new_label(); + gen_jcc1(s, s->cc_op, op1, l1); + gen_helper_fmov_ST0_STN(tcg_const_i32(opreg)); + gen_set_label(l1); + } + break; + default: + goto illegal_op; + } + } + break; + /************************/ + /* string ops */ + + case 0xa4: /* movsS */ + case 0xa5: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + + if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) { + gen_repz_movs(s, ot, pc_start - s->cs_base, s->pc - s->cs_base); + } else { + gen_movs(s, ot); + } + break; + + case 0xaa: /* stosS */ + case 0xab: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + + if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) { + gen_repz_stos(s, ot, pc_start - s->cs_base, s->pc - s->cs_base); + } else { + gen_stos(s, ot); + } + break; + case 0xac: /* lodsS */ + case 0xad: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) { + gen_repz_lods(s, ot, pc_start - s->cs_base, s->pc - s->cs_base); + } else { + gen_lods(s, ot); + } + break; + case 0xae: /* scasS */ + case 0xaf: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + if (prefixes & PREFIX_REPNZ) { + gen_repz_scas(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 1); + } else if (prefixes & PREFIX_REPZ) { + gen_repz_scas(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 0); + } else { + gen_scas(s, ot); + s->cc_op = CC_OP_SUBB + ot; + } + break; + + case 0xa6: /* cmpsS */ + case 0xa7: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag + OT_WORD; + if (prefixes & PREFIX_REPNZ) { + gen_repz_cmps(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 1); + } else if (prefixes & PREFIX_REPZ) { + gen_repz_cmps(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 0); + } else { + gen_cmps(s, ot); + s->cc_op = CC_OP_SUBB + ot; + } + break; + case 0x6c: /* insS */ + case 0x6d: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag ? OT_LONG : OT_WORD; + gen_op_mov_TN_reg(OT_WORD, 0, R_EDX); + gen_op_andl_T0_ffff(); + gen_check_io(s, ot, pc_start - s->cs_base, + SVM_IOIO_TYPE_MASK | svm_is_rep(prefixes) | 4); + if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) { + gen_repz_ins(s, ot, pc_start - s->cs_base, s->pc - s->cs_base); + } else { + gen_ins(s, ot); + if (use_icount) { + gen_jmp(s, s->pc - s->cs_base); + } + } + break; + case 0x6e: /* outsS */ + case 0x6f: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag ? OT_LONG : OT_WORD; + gen_op_mov_TN_reg(OT_WORD, 0, R_EDX); + gen_op_andl_T0_ffff(); + gen_check_io(s, ot, pc_start - s->cs_base, + svm_is_rep(prefixes) | 4); + if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) { + gen_repz_outs(s, ot, pc_start - s->cs_base, s->pc - s->cs_base); + } else { + gen_outs(s, ot); + if (use_icount) { + gen_jmp(s, s->pc - s->cs_base); + } + } + break; + + /************************/ + /* port I/O */ + + case 0xe4: + case 0xe5: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag ? 
OT_LONG : OT_WORD; + val = ldub_code(s->pc++); + gen_op_movl_T0_im(val); + gen_check_io(s, ot, pc_start - s->cs_base, + SVM_IOIO_TYPE_MASK | svm_is_rep(prefixes)); + if (use_icount) + gen_io_start(); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_in_func(ot, cpu_T[1], cpu_tmp2_i32); + gen_op_mov_reg_T1(ot, R_EAX); + if (use_icount) { + gen_io_end(); + gen_jmp(s, s->pc - s->cs_base); + } + break; + case 0xe6: + case 0xe7: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag ? OT_LONG : OT_WORD; + val = ldub_code(s->pc++); + gen_op_movl_T0_im(val); + gen_check_io(s, ot, pc_start - s->cs_base, + svm_is_rep(prefixes)); + gen_op_mov_TN_reg(ot, 1, R_EAX); + + if (use_icount) + gen_io_start(); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + tcg_gen_andi_i32(cpu_tmp2_i32, cpu_tmp2_i32, 0xffff); + tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]); + gen_helper_out_func(ot, cpu_tmp2_i32, cpu_tmp3_i32); + if (use_icount) { + gen_io_end(); + gen_jmp(s, s->pc - s->cs_base); + } + break; + case 0xec: + case 0xed: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag ? OT_LONG : OT_WORD; + gen_op_mov_TN_reg(OT_WORD, 0, R_EDX); + gen_op_andl_T0_ffff(); + gen_check_io(s, ot, pc_start - s->cs_base, + SVM_IOIO_TYPE_MASK | svm_is_rep(prefixes)); + if (use_icount) + gen_io_start(); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_in_func(ot, cpu_T[1], cpu_tmp2_i32); + gen_op_mov_reg_T1(ot, R_EAX); + if (use_icount) { + gen_io_end(); + gen_jmp(s, s->pc - s->cs_base); + } + break; + case 0xee: + case 0xef: + if ((b & 1) == 0) + ot = OT_BYTE; + else + ot = dflag ? OT_LONG : OT_WORD; + gen_op_mov_TN_reg(OT_WORD, 0, R_EDX); + gen_op_andl_T0_ffff(); + gen_check_io(s, ot, pc_start - s->cs_base, + svm_is_rep(prefixes)); + gen_op_mov_TN_reg(ot, 1, R_EAX); + + if (use_icount) + gen_io_start(); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + tcg_gen_andi_i32(cpu_tmp2_i32, cpu_tmp2_i32, 0xffff); + tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]); + gen_helper_out_func(ot, cpu_tmp2_i32, cpu_tmp3_i32); + if (use_icount) { + gen_io_end(); + gen_jmp(s, s->pc - s->cs_base); + } + break; + + /************************/ + /* control */ + case 0xc2: /* ret im */ + val = ldsw_code(s->pc); + s->pc += 2; + gen_pop_T0(s); + if (CODE64(s) && s->dflag) + s->dflag = 2; + gen_stack_update(s, val + (2 << s->dflag)); + if (s->dflag == 0) + gen_op_andl_T0_ffff(); + gen_op_jmp_T0(); + gen_eob(s); + break; + case 0xc3: /* ret */ + gen_pop_T0(s); + gen_pop_update(s); + if (s->dflag == 0) + gen_op_andl_T0_ffff(); + gen_op_jmp_T0(); + gen_eob(s); + break; + case 0xca: /* lret im */ + val = ldsw_code(s->pc); + s->pc += 2; + do_lret: + if (s->pe && !s->vm86) { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_lret_protected(tcg_const_i32(s->dflag), + tcg_const_i32(val)); + } else { + gen_stack_A0(s); + /* pop offset */ + gen_op_ld_T0_A0(1 + s->dflag + s->mem_index); + if (s->dflag == 0) + gen_op_andl_T0_ffff(); + /* NOTE: keeping EIP updated is not a problem in case of + exception */ + gen_op_jmp_T0(); + /* pop selector */ + gen_op_addl_A0_im(2 << s->dflag); + gen_op_ld_T0_A0(1 + s->dflag + s->mem_index); + gen_op_movl_seg_T0_vm(R_CS); + /* add stack offset */ + gen_stack_update(s, val + (4 << s->dflag)); + } + gen_eob(s); + break; + case 0xcb: /* lret */ + val = 0; + goto do_lret; + case 0xcf: /* iret */ + gen_svm_check_intercept(s, pc_start, SVM_EXIT_IRET); + if (!s->pe) { + /* real mode */ + gen_helper_iret_real(tcg_const_i32(s->dflag)); + s->cc_op = 
CC_OP_EFLAGS; + } else if (s->vm86) { +#ifdef VBOX + if (s->iopl != 3 && (!s->vme || s->dflag)) { +#else + if (s->iopl != 3) { +#endif + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_helper_iret_real(tcg_const_i32(s->dflag)); + s->cc_op = CC_OP_EFLAGS; + } + } else { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_iret_protected(tcg_const_i32(s->dflag), + tcg_const_i32(s->pc - s->cs_base)); + s->cc_op = CC_OP_EFLAGS; + } + gen_eob(s); + break; + case 0xe8: /* call im */ + { + if (dflag) + tval = (int32_t)insn_get(s, OT_LONG); + else + tval = (int16_t)insn_get(s, OT_WORD); + next_eip = s->pc - s->cs_base; + tval += next_eip; + if (s->dflag == 0) + tval &= 0xffff; + else if(!CODE64(s)) + tval &= 0xffffffff; + gen_movtl_T0_im(next_eip); + gen_push_T0(s); + gen_jmp(s, tval); + } + break; + case 0x9a: /* lcall im */ + { + unsigned int selector, offset; + + if (CODE64(s)) + goto illegal_op; + ot = dflag ? OT_LONG : OT_WORD; + offset = insn_get(s, ot); + selector = insn_get(s, OT_WORD); + + gen_op_movl_T0_im(selector); + gen_op_movl_T1_imu(offset); + } + goto do_lcall; + case 0xe9: /* jmp im */ + if (dflag) + tval = (int32_t)insn_get(s, OT_LONG); + else + tval = (int16_t)insn_get(s, OT_WORD); + tval += s->pc - s->cs_base; + if (s->dflag == 0) + tval &= 0xffff; + else if(!CODE64(s)) + tval &= 0xffffffff; + gen_jmp(s, tval); + break; + case 0xea: /* ljmp im */ + { + unsigned int selector, offset; + + if (CODE64(s)) + goto illegal_op; + ot = dflag ? OT_LONG : OT_WORD; + offset = insn_get(s, ot); + selector = insn_get(s, OT_WORD); + + gen_op_movl_T0_im(selector); + gen_op_movl_T1_imu(offset); + } + goto do_ljmp; + case 0xeb: /* jmp Jb */ + tval = (int8_t)insn_get(s, OT_BYTE); + tval += s->pc - s->cs_base; + if (s->dflag == 0) + tval &= 0xffff; + gen_jmp(s, tval); + break; + case 0x70 ... 0x7f: /* jcc Jb */ + tval = (int8_t)insn_get(s, OT_BYTE); + goto do_jcc; + case 0x180 ... 0x18f: /* jcc Jv */ + if (dflag) { + tval = (int32_t)insn_get(s, OT_LONG); + } else { + tval = (int16_t)insn_get(s, OT_WORD); + } + do_jcc: + next_eip = s->pc - s->cs_base; + tval += next_eip; + if (s->dflag == 0) + tval &= 0xffff; + gen_jcc(s, b, tval, next_eip); + break; + + case 0x190 ... 0x19f: /* setcc Gv */ + modrm = ldub_code(s->pc++); + gen_setcc(s, b); + gen_ldst_modrm(s, modrm, OT_BYTE, OR_TMP0, 1); + break; + case 0x140 ... 0x14f: /* cmov Gv, Ev */ + { + int l1; + TCGv t0; + + ot = dflag + OT_WORD; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + t0 = tcg_temp_local_new(); + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_op_ld_v(ot + s->mem_index, t0, cpu_A0); + } else { + rm = (modrm & 7) | REX_B(s); + gen_op_mov_v_reg(ot, t0, rm); + } +#ifdef TARGET_X86_64 + if (ot == OT_LONG) { + /* XXX: specific Intel behaviour ? 
*/ + l1 = gen_new_label(); + gen_jcc1(s, s->cc_op, b ^ 1, l1); + tcg_gen_mov_tl(cpu_regs[reg], t0); + gen_set_label(l1); + tcg_gen_ext32u_tl(cpu_regs[reg], cpu_regs[reg]); + } else +#endif + { + l1 = gen_new_label(); + gen_jcc1(s, s->cc_op, b ^ 1, l1); + gen_op_mov_reg_v(ot, reg, t0); + gen_set_label(l1); + } + tcg_temp_free(t0); + } + break; + + /************************/ + /* flags */ + case 0x9c: /* pushf */ + gen_svm_check_intercept(s, pc_start, SVM_EXIT_PUSHF); +#ifdef VBOX + if (s->vm86 && s->iopl != 3 && (!s->vme || s->dflag)) { +#else + if (s->vm86 && s->iopl != 3) { +#endif + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); +#ifdef VBOX + if (s->vm86 && s->vme && s->iopl != 3) + gen_helper_read_eflags_vme(cpu_T[0]); + else +#endif + gen_helper_read_eflags(cpu_T[0]); + gen_push_T0(s); + } + break; + case 0x9d: /* popf */ + gen_svm_check_intercept(s, pc_start, SVM_EXIT_POPF); +#ifdef VBOX + if (s->vm86 && s->iopl != 3 && (!s->vme || s->dflag)) { +#else + if (s->vm86 && s->iopl != 3) { +#endif + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_pop_T0(s); + if (s->cpl == 0) { + if (s->dflag) { + gen_helper_write_eflags(cpu_T[0], + tcg_const_i32((TF_MASK | AC_MASK | ID_MASK | NT_MASK | IF_MASK | IOPL_MASK))); + } else { + gen_helper_write_eflags(cpu_T[0], + tcg_const_i32((TF_MASK | AC_MASK | ID_MASK | NT_MASK | IF_MASK | IOPL_MASK) & 0xffff)); + } + } else { + if (s->cpl <= s->iopl) { + if (s->dflag) { + gen_helper_write_eflags(cpu_T[0], + tcg_const_i32((TF_MASK | AC_MASK | ID_MASK | NT_MASK | IF_MASK))); + } else { + gen_helper_write_eflags(cpu_T[0], + tcg_const_i32((TF_MASK | AC_MASK | ID_MASK | NT_MASK | IF_MASK) & 0xffff)); + } + } else { + if (s->dflag) { + gen_helper_write_eflags(cpu_T[0], + tcg_const_i32((TF_MASK | AC_MASK | ID_MASK | NT_MASK))); + } else { +#ifdef VBOX + if (s->vm86 && s->vme) + gen_helper_write_eflags_vme(cpu_T[0]); + else +#endif + gen_helper_write_eflags(cpu_T[0], + tcg_const_i32((TF_MASK | AC_MASK | ID_MASK | NT_MASK) & 0xffff)); + } + } + } + gen_pop_update(s); + s->cc_op = CC_OP_EFLAGS; + /* abort translation because TF flag may change */ + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } + break; + case 0x9e: /* sahf */ + if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) + goto illegal_op; + gen_op_mov_TN_reg(OT_BYTE, 0, R_AH); + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_compute_eflags(cpu_cc_src); + tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O); + tcg_gen_andi_tl(cpu_T[0], cpu_T[0], CC_S | CC_Z | CC_A | CC_P | CC_C); + tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_T[0]); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x9f: /* lahf */ + if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_compute_eflags(cpu_T[0]); + /* Note: gen_compute_eflags() only gives the condition codes */ + tcg_gen_ori_tl(cpu_T[0], cpu_T[0], 0x02); + gen_op_mov_reg_T0(OT_BYTE, R_AH); + break; + case 0xf5: /* cmc */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_compute_eflags(cpu_cc_src); + tcg_gen_xori_tl(cpu_cc_src, cpu_cc_src, CC_C); + s->cc_op = CC_OP_EFLAGS; + break; + case 0xf8: /* clc */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_compute_eflags(cpu_cc_src); + tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_C); + s->cc_op = CC_OP_EFLAGS; + break; + case 0xf9: /* stc */ + if (s->cc_op != CC_OP_DYNAMIC) + 
            gen_op_set_cc_op(s->cc_op);
+        gen_compute_eflags(cpu_cc_src);
+        tcg_gen_ori_tl(cpu_cc_src, cpu_cc_src, CC_C);
+        s->cc_op = CC_OP_EFLAGS;
+        break;
+    case 0xfc: /* cld */
+        tcg_gen_movi_i32(cpu_tmp2_i32, 1);
+        tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, offsetof(CPUState, df));
+        break;
+    case 0xfd: /* std */
+        tcg_gen_movi_i32(cpu_tmp2_i32, -1);
+        tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, offsetof(CPUState, df));
+        break;
+
+        /************************/
+        /* bit operations */
+    case 0x1ba: /* bt/bts/btr/btc Gv, im */
+        ot = dflag + OT_WORD;
+        modrm = ldub_code(s->pc++);
+        op = (modrm >> 3) & 7;
+        mod = (modrm >> 6) & 3;
+        rm = (modrm & 7) | REX_B(s);
+        if (mod != 3) {
+            s->rip_offset = 1;
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            gen_op_ld_T0_A0(ot + s->mem_index);
+        } else {
+            gen_op_mov_TN_reg(ot, 0, rm);
+        }
+        /* load shift */
+        val = ldub_code(s->pc++);
+        gen_op_movl_T1_im(val);
+        if (op < 4)
+            goto illegal_op;
+        op -= 4;
+        goto bt_op;
+    case 0x1a3: /* bt Gv, Ev */
+        op = 0;
+        goto do_btx;
+    case 0x1ab: /* bts */
+        op = 1;
+        goto do_btx;
+    case 0x1b3: /* btr */
+        op = 2;
+        goto do_btx;
+    case 0x1bb: /* btc */
+        op = 3;
+    do_btx:
+        ot = dflag + OT_WORD;
+        modrm = ldub_code(s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        mod = (modrm >> 6) & 3;
+        rm = (modrm & 7) | REX_B(s);
+        gen_op_mov_TN_reg(OT_LONG, 1, reg);
+        if (mod != 3) {
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            /* specific case: we need to add a displacement */
+            gen_exts(ot, cpu_T[1]);
+            tcg_gen_sari_tl(cpu_tmp0, cpu_T[1], 3 + ot);
+            tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, ot);
+            tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
+            gen_op_ld_T0_A0(ot + s->mem_index);
+        } else {
+            gen_op_mov_TN_reg(ot, 0, rm);
+        }
+    bt_op:
+        tcg_gen_andi_tl(cpu_T[1], cpu_T[1], (1 << (3 + ot)) - 1);
+        switch(op) {
+        case 0:
+            tcg_gen_shr_tl(cpu_cc_src, cpu_T[0], cpu_T[1]);
+            tcg_gen_movi_tl(cpu_cc_dst, 0);
+            break;
+        case 1:
+            tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
+            tcg_gen_movi_tl(cpu_tmp0, 1);
+            tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T[1]);
+            tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            break;
+        case 2:
+            tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
+            tcg_gen_movi_tl(cpu_tmp0, 1);
+            tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T[1]);
+            tcg_gen_not_tl(cpu_tmp0, cpu_tmp0);
+            tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            break;
+        default:
+        case 3:
+            tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
+            tcg_gen_movi_tl(cpu_tmp0, 1);
+            tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T[1]);
+            tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            break;
+        }
+        s->cc_op = CC_OP_SARB + ot;
+        if (op != 0) {
+            if (mod != 3)
+                gen_op_st_T0_A0(ot + s->mem_index);
+            else
+                gen_op_mov_reg_T0(ot, rm);
+            tcg_gen_mov_tl(cpu_cc_src, cpu_tmp4);
+            tcg_gen_movi_tl(cpu_cc_dst, 0);
+        }
+        break;
+    case 0x1bc: /* bsf */
+    case 0x1bd: /* bsr */
+        {
+            int label1;
+            TCGv t0;
+
+            ot = dflag + OT_WORD;
+            modrm = ldub_code(s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+            gen_ldst_modrm(s,modrm, ot, OR_TMP0, 0);
+            gen_extu(ot, cpu_T[0]);
+            t0 = tcg_temp_local_new();
+            tcg_gen_mov_tl(t0, cpu_T[0]);
+            if ((b & 1) && (prefixes & PREFIX_REPZ) &&
+                (s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
+                switch(ot) {
+                case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0,
+                                               tcg_const_i32(16)); break;
+                case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0,
+                                               tcg_const_i32(32)); break;
+                case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0,
+                                               tcg_const_i32(64)); break;
+                }
+                gen_op_mov_reg_T0(ot, reg);
+            } else {
+                label1 = gen_new_label();
+                tcg_gen_movi_tl(cpu_cc_dst, 0);
+                tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1);
+                if (b & 1) {
+
gen_helper_bsr(cpu_T[0], t0); + } else { + gen_helper_bsf(cpu_T[0], t0); + } + gen_op_mov_reg_T0(ot, reg); + tcg_gen_movi_tl(cpu_cc_dst, 1); + gen_set_label(label1); + tcg_gen_discard_tl(cpu_cc_src); + s->cc_op = CC_OP_LOGICB + ot; + } + tcg_temp_free(t0); + } + break; + /************************/ + /* bcd */ + case 0x27: /* daa */ + if (CODE64(s)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_daa(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x2f: /* das */ + if (CODE64(s)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_das(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x37: /* aaa */ + if (CODE64(s)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_aaa(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0x3f: /* aas */ + if (CODE64(s)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_helper_aas(); + s->cc_op = CC_OP_EFLAGS; + break; + case 0xd4: /* aam */ + if (CODE64(s)) + goto illegal_op; + val = ldub_code(s->pc++); + if (val == 0) { + gen_exception(s, EXCP00_DIVZ, pc_start - s->cs_base); + } else { + gen_helper_aam(tcg_const_i32(val)); + s->cc_op = CC_OP_LOGICB; + } + break; + case 0xd5: /* aad */ + if (CODE64(s)) + goto illegal_op; + val = ldub_code(s->pc++); + gen_helper_aad(tcg_const_i32(val)); + s->cc_op = CC_OP_LOGICB; + break; + /************************/ + /* misc */ + case 0x90: /* nop */ + /* XXX: correct lock test for all insn */ + if (prefixes & PREFIX_LOCK) { + goto illegal_op; + } + /* If REX_B is set, then this is xchg eax, r8d, not a nop. */ + if (REX_B(s)) { + goto do_xchg_reg_eax; + } + if (prefixes & PREFIX_REPZ) { + gen_svm_check_intercept(s, pc_start, SVM_EXIT_PAUSE); + } + break; + case 0x9b: /* fwait */ + if ((s->flags & (HF_MP_MASK | HF_TS_MASK)) == + (HF_MP_MASK | HF_TS_MASK)) { + gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); + } else { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fwait(); + } + break; + case 0xcc: /* int3 */ +#ifdef VBOX + if (s->vm86 && s->iopl != 3 && !s->vme) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else +#endif + gen_interrupt(s, EXCP03_INT3, pc_start - s->cs_base, s->pc - s->cs_base); + break; + case 0xcd: /* int N */ + val = ldub_code(s->pc++); +#ifdef VBOX + if (s->vm86 && s->iopl != 3 && !s->vme) { +#else + if (s->vm86 && s->iopl != 3) { +#endif + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_interrupt(s, val, pc_start - s->cs_base, s->pc - s->cs_base); + } + break; + case 0xce: /* into */ + if (CODE64(s)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_into(tcg_const_i32(s->pc - pc_start)); + break; +#ifdef WANT_ICEBP + case 0xf1: /* icebp (undocumented, exits to external debugger) */ + gen_svm_check_intercept(s, pc_start, SVM_EXIT_ICEBP); +#if 1 + gen_debug(s, pc_start - s->cs_base); +#else + /* start debug */ + tb_flush(cpu_single_env); + cpu_set_log(CPU_LOG_INT | CPU_LOG_TB_IN_ASM); +#endif + break; +#endif + case 0xfa: /* cli */ + if (!s->vm86) { + if (s->cpl <= s->iopl) { + gen_helper_cli(); + } else { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } + } else { + if (s->iopl == 3) { + gen_helper_cli(); +#ifdef VBOX + } else if (s->iopl != 3 && s->vme) { + gen_helper_cli_vme(); +#endif + } else { + gen_exception(s, EXCP0D_GPF, pc_start - 
s->cs_base); + } + } + break; + case 0xfb: /* sti */ + if (!s->vm86) { + if (s->cpl <= s->iopl) { + gen_sti: + gen_helper_sti(); + /* interruptions are enabled only the first insn after sti */ + /* If several instructions disable interrupts, only the + _first_ does it */ + if (!(s->tb->flags & HF_INHIBIT_IRQ_MASK)) + gen_helper_set_inhibit_irq(); + /* give a chance to handle pending irqs */ + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } else { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } + } else { + if (s->iopl == 3) { + goto gen_sti; +#ifdef VBOX + } else if (s->iopl != 3 && s->vme) { + gen_helper_sti_vme(); + /* give a chance to handle pending irqs */ + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); +#endif + } else { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } + } + break; + case 0x62: /* bound */ + if (CODE64(s)) + goto illegal_op; + ot = dflag ? OT_LONG : OT_WORD; + modrm = ldub_code(s->pc++); + reg = (modrm >> 3) & 7; + mod = (modrm >> 6) & 3; + if (mod == 3) + goto illegal_op; + gen_op_mov_TN_reg(ot, 0, reg); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_jmp_im(pc_start - s->cs_base); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + if (ot == OT_WORD) + gen_helper_boundw(cpu_A0, cpu_tmp2_i32); + else + gen_helper_boundl(cpu_A0, cpu_tmp2_i32); + break; + case 0x1c8 ... 0x1cf: /* bswap reg */ + reg = (b & 7) | REX_B(s); +#ifdef TARGET_X86_64 + if (dflag == 2) { + gen_op_mov_TN_reg(OT_QUAD, 0, reg); + tcg_gen_bswap64_i64(cpu_T[0], cpu_T[0]); + gen_op_mov_reg_T0(OT_QUAD, reg); + } else +#endif + { + gen_op_mov_TN_reg(OT_LONG, 0, reg); + tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]); + tcg_gen_bswap32_tl(cpu_T[0], cpu_T[0]); + gen_op_mov_reg_T0(OT_LONG, reg); + } + break; + case 0xd6: /* salc */ + if (CODE64(s)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_compute_eflags_c(cpu_T[0]); + tcg_gen_neg_tl(cpu_T[0], cpu_T[0]); + gen_op_mov_reg_T0(OT_BYTE, R_EAX); + break; + case 0xe0: /* loopnz */ + case 0xe1: /* loopz */ + case 0xe2: /* loop */ + case 0xe3: /* jecxz */ + { + int l1, l2, l3; + + tval = (int8_t)insn_get(s, OT_BYTE); + next_eip = s->pc - s->cs_base; + tval += next_eip; + if (s->dflag == 0) + tval &= 0xffff; + + l1 = gen_new_label(); + l2 = gen_new_label(); + l3 = gen_new_label(); + b &= 3; + switch(b) { + case 0: /* loopnz */ + case 1: /* loopz */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_op_add_reg_im(s->aflag, R_ECX, -1); + gen_op_jz_ecx(s->aflag, l3); + gen_compute_eflags(cpu_tmp0); + tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_Z); + if (b == 0) { + tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, l1); + } else { + tcg_gen_brcondi_tl(TCG_COND_NE, cpu_tmp0, 0, l1); + } + break; + case 2: /* loop */ + gen_op_add_reg_im(s->aflag, R_ECX, -1); + gen_op_jnz_ecx(s->aflag, l1); + break; + default: + case 3: /* jcxz */ + gen_op_jz_ecx(s->aflag, l1); + break; + } + + gen_set_label(l3); + gen_jmp_im(next_eip); + tcg_gen_br(l2); + + gen_set_label(l1); + gen_jmp_im(tval); + gen_set_label(l2); + gen_eob(s); + } + break; + case 0x130: /* wrmsr */ + case 0x132: /* rdmsr */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + if (b & 2) { + gen_helper_rdmsr(); + } else { + gen_helper_wrmsr(); + } + } + break; + case 0x131: /* rdtsc */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + if (use_icount) + 
gen_io_start(); + gen_helper_rdtsc(); + if (use_icount) { + gen_io_end(); + gen_jmp(s, s->pc - s->cs_base); + } + break; + case 0x133: /* rdpmc */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_rdpmc(); + break; + case 0x134: /* sysenter */ +#ifndef VBOX + /* For Intel SYSENTER is valid on 64-bit */ + if (CODE64(s) && cpu_single_env->cpuid_vendor1 != CPUID_VENDOR_INTEL_1) +#else + if ( !(cpu_single_env->cpuid_features & CPUID_SEP) + || ( IS_LONG_MODE(s) + && CPUMGetGuestCpuVendor(cpu_single_env->pVM) != CPUMCPUVENDOR_INTEL)) +#endif + goto illegal_op; + if (!s->pe) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_update_cc_op(s); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_sysenter(); + gen_eob(s); + } + break; + case 0x135: /* sysexit */ +#ifndef VBOX + /* For Intel SYSEXIT is valid on 64-bit */ + if (CODE64(s) && cpu_single_env->cpuid_vendor1 != CPUID_VENDOR_INTEL_1) +#else + if ( !(cpu_single_env->cpuid_features & CPUID_SEP) + || ( IS_LONG_MODE(s) + && CPUMGetGuestCpuVendor(cpu_single_env->pVM) != CPUMCPUVENDOR_INTEL)) +#endif + goto illegal_op; + if (!s->pe) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_update_cc_op(s); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_sysexit(tcg_const_i32(dflag)); + gen_eob(s); + } + break; +#ifdef TARGET_X86_64 + case 0x105: /* syscall */ + /* XXX: is it usable in real mode ? */ + gen_update_cc_op(s); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_syscall(tcg_const_i32(s->pc - pc_start)); + gen_eob(s); + break; + case 0x107: /* sysret */ + if (!s->pe) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_update_cc_op(s); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_sysret(tcg_const_i32(s->dflag)); + /* condition codes are modified only in long mode */ + if (s->lma) + s->cc_op = CC_OP_EFLAGS; + gen_eob(s); + } + break; +#endif + case 0x1a2: /* cpuid */ + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_cpuid(); + break; + case 0xf4: /* hlt */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_hlt(tcg_const_i32(s->pc - pc_start)); + s->is_jmp = DISAS_TB_JUMP; + } + break; + case 0x100: + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + op = (modrm >> 3) & 7; + switch(op) { + case 0: /* sldt */ + if (!s->pe || s->vm86) + goto illegal_op; + gen_svm_check_intercept(s, pc_start, SVM_EXIT_LDTR_READ); + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,ldt.selector)); + ot = OT_WORD; + if (mod == 3) + ot += s->dflag; + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 1); + break; + case 2: /* lldt */ + if (!s->pe || s->vm86) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_svm_check_intercept(s, pc_start, SVM_EXIT_LDTR_WRITE); + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0); + gen_jmp_im(pc_start - s->cs_base); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_lldt(cpu_tmp2_i32); + } + break; + case 1: /* str */ + if (!s->pe || s->vm86) + goto illegal_op; + gen_svm_check_intercept(s, pc_start, SVM_EXIT_TR_READ); + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,tr.selector)); + ot = OT_WORD; + if (mod == 3) + ot += s->dflag; + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 1); + break; + case 3: /* ltr */ + if (!s->pe || 
s->vm86) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_svm_check_intercept(s, pc_start, SVM_EXIT_TR_WRITE); + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0); + gen_jmp_im(pc_start - s->cs_base); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_helper_ltr(cpu_tmp2_i32); + } + break; + case 4: /* verr */ + case 5: /* verw */ + if (!s->pe || s->vm86) + goto illegal_op; + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0); + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + if (op == 4) + gen_helper_verr(cpu_T[0]); + else + gen_helper_verw(cpu_T[0]); + s->cc_op = CC_OP_EFLAGS; + break; + default: + goto illegal_op; + } + break; + case 0x101: + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + op = (modrm >> 3) & 7; + rm = modrm & 7; + switch(op) { + case 0: /* sgdt */ + if (mod == 3) + goto illegal_op; + gen_svm_check_intercept(s, pc_start, SVM_EXIT_GDTR_READ); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, gdt.limit)); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + gen_add_A0_im(s, 2); + tcg_gen_ld_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, gdt.base)); + gen_op_st_T0_A0(CODE64(s) + OT_LONG + s->mem_index); + break; + case 1: + if (mod == 3) { + switch (rm) { + case 0: /* monitor */ + if (!(s->cpuid_ext_features & CPUID_EXT_MONITOR) || + s->cpl != 0) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); +#ifdef TARGET_X86_64 + if (s->aflag == 2) { + gen_op_movq_A0_reg(R_EAX); + } else +#endif + { + gen_op_movl_A0_reg(R_EAX); + if (s->aflag == 0) + gen_op_andl_A0_ffff(); + } + gen_add_A0_ds_seg(s); + gen_helper_monitor(cpu_A0); + break; + case 1: /* mwait */ + if (!(s->cpuid_ext_features & CPUID_EXT_MONITOR) || + s->cpl != 0) + goto illegal_op; + gen_update_cc_op(s); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_mwait(tcg_const_i32(s->pc - pc_start)); + gen_eob(s); + break; + default: + goto illegal_op; + } + } else { /* sidt */ + gen_svm_check_intercept(s, pc_start, SVM_EXIT_IDTR_READ); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, idt.limit)); + gen_op_st_T0_A0(OT_WORD + s->mem_index); + gen_add_A0_im(s, 2); + tcg_gen_ld_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, idt.base)); + gen_op_st_T0_A0(CODE64(s) + OT_LONG + s->mem_index); + } + break; + case 2: /* lgdt */ + case 3: /* lidt */ + if (mod == 3) { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + switch(rm) { + case 0: /* VMRUN */ + if (!(s->flags & HF_SVME_MASK) || !s->pe) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + break; + } else { + gen_helper_vmrun(tcg_const_i32(s->aflag), + tcg_const_i32(s->pc - pc_start)); + tcg_gen_exit_tb(0); + s->is_jmp = DISAS_TB_JUMP; + } + break; + case 1: /* VMMCALL */ + if (!(s->flags & HF_SVME_MASK)) + goto illegal_op; + gen_helper_vmmcall(); + break; + case 2: /* VMLOAD */ + if (!(s->flags & HF_SVME_MASK) || !s->pe) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + break; + } else { + gen_helper_vmload(tcg_const_i32(s->aflag)); + } + break; + case 3: /* VMSAVE */ + if (!(s->flags & HF_SVME_MASK) || !s->pe) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + break; + } else { + gen_helper_vmsave(tcg_const_i32(s->aflag)); + } + break; + case 
4: /* STGI */ + if ((!(s->flags & HF_SVME_MASK) && + !(s->cpuid_ext3_features & CPUID_EXT3_SKINIT)) || + !s->pe) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + break; + } else { + gen_helper_stgi(); + } + break; + case 5: /* CLGI */ + if (!(s->flags & HF_SVME_MASK) || !s->pe) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + break; + } else { + gen_helper_clgi(); + } + break; + case 6: /* SKINIT */ + if ((!(s->flags & HF_SVME_MASK) && + !(s->cpuid_ext3_features & CPUID_EXT3_SKINIT)) || + !s->pe) + goto illegal_op; + gen_helper_skinit(); + break; + case 7: /* INVLPGA */ + if (!(s->flags & HF_SVME_MASK) || !s->pe) + goto illegal_op; + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + break; + } else { + gen_helper_invlpga(tcg_const_i32(s->aflag)); + } + break; + default: + goto illegal_op; + } + } else if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_svm_check_intercept(s, pc_start, + op==2 ? SVM_EXIT_GDTR_WRITE : SVM_EXIT_IDTR_WRITE); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_op_ld_T1_A0(OT_WORD + s->mem_index); + gen_add_A0_im(s, 2); + gen_op_ld_T0_A0(CODE64(s) + OT_LONG + s->mem_index); + if (!s->dflag) + gen_op_andl_T0_im(0xffffff); + if (op == 2) { + tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,gdt.base)); + tcg_gen_st32_tl(cpu_T[1], cpu_env, offsetof(CPUX86State,gdt.limit)); + } else { + tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,idt.base)); + tcg_gen_st32_tl(cpu_T[1], cpu_env, offsetof(CPUX86State,idt.limit)); + } + } + break; + case 4: /* smsw */ + gen_svm_check_intercept(s, pc_start, SVM_EXIT_READ_CR0); +#if defined TARGET_X86_64 && defined HOST_WORDS_BIGENDIAN + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,cr[0]) + 4); +#else + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,cr[0])); +#endif + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 1); + break; + case 6: /* lmsw */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_svm_check_intercept(s, pc_start, SVM_EXIT_WRITE_CR0); + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0); + gen_helper_lmsw(cpu_T[0]); + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } + break; + case 7: + if (mod != 3) { /* invlpg */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_helper_invlpg(cpu_A0); + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } + } else { + switch (rm) { + case 0: /* swapgs */ +#ifdef TARGET_X86_64 + if (CODE64(s)) { + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + tcg_gen_ld_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,segs[R_GS].base)); + tcg_gen_ld_tl(cpu_T[1], cpu_env, + offsetof(CPUX86State,kernelgsbase)); + tcg_gen_st_tl(cpu_T[1], cpu_env, + offsetof(CPUX86State,segs[R_GS].base)); + tcg_gen_st_tl(cpu_T[0], cpu_env, + offsetof(CPUX86State,kernelgsbase)); + } + } else +#endif + { + goto illegal_op; + } + break; + case 1: /* rdtscp */ + if (!(s->cpuid_ext2_features & CPUID_EXT2_RDTSCP)) + goto illegal_op; + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + if (use_icount) + gen_io_start(); + gen_helper_rdtscp(); + if (use_icount) { + gen_io_end(); + gen_jmp(s, s->pc - s->cs_base); + } + break; + 
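+            /* Any other rm value with mod == 3 is not handled by this
+               translator and takes the default case below, which raises
+               an undefined opcode exception. */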
default: + goto illegal_op; + } + } + break; + default: + goto illegal_op; + } + break; + case 0x108: /* invd */ + case 0x109: /* wbinvd */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_svm_check_intercept(s, pc_start, (b & 2) ? SVM_EXIT_INVD : SVM_EXIT_WBINVD); + /* nothing to do */ + } + break; + case 0x63: /* arpl or movslS (x86_64) */ +#ifdef TARGET_X86_64 + if (CODE64(s)) { + int d_ot; + /* d_ot is the size of destination */ + d_ot = dflag + OT_WORD; + + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + mod = (modrm >> 6) & 3; + rm = (modrm & 7) | REX_B(s); + + if (mod == 3) { + gen_op_mov_TN_reg(OT_LONG, 0, rm); + /* sign extend */ + if (d_ot == OT_QUAD) + tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]); + gen_op_mov_reg_T0(d_ot, reg); + } else { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if (d_ot == OT_QUAD) { + gen_op_lds_T0_A0(OT_LONG + s->mem_index); + } else { + gen_op_ld_T0_A0(OT_LONG + s->mem_index); + } + gen_op_mov_reg_T0(d_ot, reg); + } + } else +#endif + { + int label1; + TCGv t0, t1, t2, a0; + + if (!s->pe || s->vm86) + goto illegal_op; + t0 = tcg_temp_local_new(); + t1 = tcg_temp_local_new(); + t2 = tcg_temp_local_new(); + ot = OT_WORD; + modrm = ldub_code(s->pc++); + reg = (modrm >> 3) & 7; + mod = (modrm >> 6) & 3; + rm = modrm & 7; + if (mod != 3) { + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + gen_op_ld_v(ot + s->mem_index, t0, cpu_A0); + a0 = tcg_temp_local_new(); + tcg_gen_mov_tl(a0, cpu_A0); + } else { + gen_op_mov_v_reg(ot, t0, rm); + TCGV_UNUSED(a0); + } + gen_op_mov_v_reg(ot, t1, reg); + tcg_gen_andi_tl(cpu_tmp0, t0, 3); + tcg_gen_andi_tl(t1, t1, 3); + tcg_gen_movi_tl(t2, 0); + label1 = gen_new_label(); + tcg_gen_brcond_tl(TCG_COND_GE, cpu_tmp0, t1, label1); + tcg_gen_andi_tl(t0, t0, ~3); + tcg_gen_or_tl(t0, t0, t1); + tcg_gen_movi_tl(t2, CC_Z); + gen_set_label(label1); + if (mod != 3) { + gen_op_st_v(ot + s->mem_index, t0, a0); + tcg_temp_free(a0); + } else { + gen_op_mov_reg_v(ot, rm, t0); + } + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_compute_eflags(cpu_cc_src); + tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_Z); + tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t2); + s->cc_op = CC_OP_EFLAGS; + tcg_temp_free(t0); + tcg_temp_free(t1); + tcg_temp_free(t2); + } + break; + case 0x102: /* lar */ + case 0x103: /* lsl */ + { + int label1; + TCGv t0; + if (!s->pe || s->vm86) + goto illegal_op; + ot = dflag ? OT_LONG : OT_WORD; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0); + t0 = tcg_temp_local_new(); + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + if (b == 0x102) + gen_helper_lar(t0, cpu_T[0]); + else + gen_helper_lsl(t0, cpu_T[0]); + tcg_gen_andi_tl(cpu_tmp0, cpu_cc_src, CC_Z); + label1 = gen_new_label(); + tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, label1); + gen_op_mov_reg_v(ot, reg, t0); + gen_set_label(label1); + s->cc_op = CC_OP_EFLAGS; + tcg_temp_free(t0); + } + break; + case 0x118: + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + op = (modrm >> 3) & 7; + switch(op) { + case 0: /* prefetchnta */ + case 1: /* prefetchnt0 */ + case 2: /* prefetchnt0 */ + case 3: /* prefetchnt0 */ + if (mod == 3) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + /* nothing more to do */ + break; + default: /* nop (multi byte) */ + gen_nop_modrm(s, modrm); + break; + } + break; + case 0x119 ... 
0x11f: /* nop (multi byte) */ + modrm = ldub_code(s->pc++); + gen_nop_modrm(s, modrm); + break; + case 0x120: /* mov reg, crN */ + case 0x122: /* mov crN, reg */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + modrm = ldub_code(s->pc++); +#ifndef VBOX /* mod bits are always understood to be 11 (0xc0) regardless of actual content; see AMD manuals */ + if ((modrm & 0xc0) != 0xc0) + goto illegal_op; +#endif + rm = (modrm & 7) | REX_B(s); + reg = ((modrm >> 3) & 7) | rex_r; + if (CODE64(s)) + ot = OT_QUAD; + else + ot = OT_LONG; + if ((prefixes & PREFIX_LOCK) && (reg == 0) && + (s->cpuid_ext3_features & CPUID_EXT3_CR8LEG)) { + reg = 8; + } + switch(reg) { + case 0: + case 2: + case 3: + case 4: + case 8: + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + if (b & 2) { + gen_op_mov_TN_reg(ot, 0, rm); + gen_helper_write_crN(tcg_const_i32(reg), cpu_T[0]); + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } else { + gen_helper_read_crN(cpu_T[0], tcg_const_i32(reg)); + gen_op_mov_reg_T0(ot, rm); + } + break; + default: + goto illegal_op; + } + } + break; + case 0x121: /* mov reg, drN */ + case 0x123: /* mov drN, reg */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + modrm = ldub_code(s->pc++); +#ifndef VBOX /* mod bits are always understood to be 11 (0xc0) regardless of actual content; see AMD manuals */ + if ((modrm & 0xc0) != 0xc0) + goto illegal_op; +#endif + rm = (modrm & 7) | REX_B(s); + reg = ((modrm >> 3) & 7) | rex_r; + if (CODE64(s)) + ot = OT_QUAD; + else + ot = OT_LONG; + /* XXX: do it dynamically with CR4.DE bit */ + if (reg == 4 || reg == 5 || reg >= 8) + goto illegal_op; + if (b & 2) { + gen_svm_check_intercept(s, pc_start, SVM_EXIT_WRITE_DR0 + reg); + gen_op_mov_TN_reg(ot, 0, rm); + gen_helper_movl_drN_T0(tcg_const_i32(reg), cpu_T[0]); + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } else { + gen_svm_check_intercept(s, pc_start, SVM_EXIT_READ_DR0 + reg); + tcg_gen_ld_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,dr[reg])); + gen_op_mov_reg_T0(ot, rm); + } + } + break; + case 0x106: /* clts */ + if (s->cpl != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + } else { + gen_svm_check_intercept(s, pc_start, SVM_EXIT_WRITE_CR0); + gen_helper_clts(); + /* abort block because static cpu state changed */ + gen_jmp_im(s->pc - s->cs_base); + gen_eob(s); + } + break; + /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */ + case 0x1c3: /* MOVNTI reg, mem */ + if (!(s->cpuid_features & CPUID_SSE2)) + goto illegal_op; + ot = s->dflag == 2 ? 
OT_QUAD : OT_LONG; + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + if (mod == 3) + goto illegal_op; + reg = ((modrm >> 3) & 7) | rex_r; + /* generate a generic store */ + gen_ldst_modrm(s, modrm, ot, reg, 1); + break; + case 0x1ae: + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + op = (modrm >> 3) & 7; + switch(op) { + case 0: /* fxsave */ + if (mod == 3 || !(s->cpuid_features & CPUID_FXSR) || + (s->prefix & PREFIX_LOCK)) + goto illegal_op; + if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) { + gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); + break; + } + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fxsave(cpu_A0, tcg_const_i32((s->dflag == 2))); + break; + case 1: /* fxrstor */ + if (mod == 3 || !(s->cpuid_features & CPUID_FXSR) || + (s->prefix & PREFIX_LOCK)) + goto illegal_op; + if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) { + gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); + break; + } + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if (s->cc_op != CC_OP_DYNAMIC) + gen_op_set_cc_op(s->cc_op); + gen_jmp_im(pc_start - s->cs_base); + gen_helper_fxrstor(cpu_A0, tcg_const_i32((s->dflag == 2))); + break; + case 2: /* ldmxcsr */ + case 3: /* stmxcsr */ + if (s->flags & HF_TS_MASK) { + gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); + break; + } + if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK) || + mod == 3) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + if (op == 2) { + gen_op_ld_T0_A0(OT_LONG + s->mem_index); + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, mxcsr)); + } else { + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, mxcsr)); + gen_op_st_T0_A0(OT_LONG + s->mem_index); + } + break; + case 5: /* lfence */ + case 6: /* mfence */ + if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & CPUID_SSE)) + goto illegal_op; + break; + case 7: /* sfence / clflush */ + if ((modrm & 0xc7) == 0xc0) { + /* sfence */ + /* XXX: also check for cpuid_ext2_features & CPUID_EXT2_EMMX */ + if (!(s->cpuid_features & CPUID_SSE)) + goto illegal_op; + } else { + /* clflush */ + if (!(s->cpuid_features & CPUID_CLFLUSH)) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + } + break; + default: + goto illegal_op; + } + break; + case 0x10d: /* 3DNow! prefetch(w) */ + modrm = ldub_code(s->pc++); + mod = (modrm >> 6) & 3; + if (mod == 3) + goto illegal_op; + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + /* ignore for now */ + break; + case 0x1aa: /* rsm */ + gen_svm_check_intercept(s, pc_start, SVM_EXIT_RSM); + if (!(s->flags & HF_SMM_MASK)) + goto illegal_op; + gen_update_cc_op(s); + gen_jmp_im(s->pc - s->cs_base); + gen_helper_rsm(); + gen_eob(s); + break; + case 0x1b8: /* SSE4.2 popcnt */ + if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) != + PREFIX_REPZ) + goto illegal_op; + if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT)) + goto illegal_op; + + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7); + + if (s->prefix & PREFIX_DATA) + ot = OT_WORD; + else if (s->dflag != 2) + ot = OT_LONG; + else + ot = OT_QUAD; + + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + gen_helper_popcnt(cpu_T[0], cpu_T[0], tcg_const_i32(ot)); + gen_op_mov_reg_T0(ot, reg); + + s->cc_op = CC_OP_EFLAGS; + break; + case 0x10e ... 0x10f: + /* 3DNow! instructions, ignore prefixes */ + s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); + case 0x110 ... 0x117: + case 0x128 ... 
0x12f: + case 0x138 ... 0x13a: + case 0x150 ... 0x179: + case 0x17c ... 0x17f: + case 0x1c2: + case 0x1c4 ... 0x1c6: + case 0x1d0 ... 0x1fe: + gen_sse(s, b, pc_start, rex_r); + break; + default: + goto illegal_op; + } + /* lock generation */ + if (s->prefix & PREFIX_LOCK) + gen_helper_unlock(); + return s->pc; + illegal_op: + if (s->prefix & PREFIX_LOCK) + gen_helper_unlock(); + /* XXX: ensure that no lock was generated */ + gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base); + return s->pc; +} + +void optimize_flags_init(void) +{ +#if TCG_TARGET_REG_BITS == 32 + assert(sizeof(CCTable) == (1 << 3)); +#else + assert(sizeof(CCTable) == (1 << 4)); +#endif + cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env"); + cpu_cc_op = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, cc_op), "cc_op"); + cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUState, cc_src), + "cc_src"); + cpu_cc_dst = tcg_global_mem_new(TCG_AREG0, offsetof(CPUState, cc_dst), + "cc_dst"); + cpu_cc_tmp = tcg_global_mem_new(TCG_AREG0, offsetof(CPUState, cc_tmp), + "cc_tmp"); + +#ifdef TARGET_X86_64 + cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_EAX]), "rax"); + cpu_regs[R_ECX] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_ECX]), "rcx"); + cpu_regs[R_EDX] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_EDX]), "rdx"); + cpu_regs[R_EBX] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_EBX]), "rbx"); + cpu_regs[R_ESP] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_ESP]), "rsp"); + cpu_regs[R_EBP] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_EBP]), "rbp"); + cpu_regs[R_ESI] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_ESI]), "rsi"); + cpu_regs[R_EDI] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[R_EDI]), "rdi"); + cpu_regs[8] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[8]), "r8"); + cpu_regs[9] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[9]), "r9"); + cpu_regs[10] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[10]), "r10"); + cpu_regs[11] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[11]), "r11"); + cpu_regs[12] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[12]), "r12"); + cpu_regs[13] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[13]), "r13"); + cpu_regs[14] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[14]), "r14"); + cpu_regs[15] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUState, regs[15]), "r15"); +#else + cpu_regs[R_EAX] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_EAX]), "eax"); + cpu_regs[R_ECX] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_ECX]), "ecx"); + cpu_regs[R_EDX] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_EDX]), "edx"); + cpu_regs[R_EBX] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_EBX]), "ebx"); + cpu_regs[R_ESP] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_ESP]), "esp"); + cpu_regs[R_EBP] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_EBP]), "ebp"); + cpu_regs[R_ESI] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_ESI]), "esi"); + cpu_regs[R_EDI] = tcg_global_mem_new_i32(TCG_AREG0, + offsetof(CPUState, regs[R_EDI]), "edi"); +#endif + + /* register helpers */ +#define GEN_HELPER 2 +#include "helper.h" +} + +/* generate intermediate code in gen_opc_buf and gen_opparam_buf for + basic block 'tb'. 
If search_pc is TRUE, also generate PC + information for each intermediate instruction. */ +static inline void gen_intermediate_code_internal(CPUState *env, + TranslationBlock *tb, + int search_pc) +{ + DisasContext dc1, *dc = &dc1; + target_ulong pc_ptr; + uint16_t *gen_opc_end; + CPUBreakpoint *bp; + int j, lj; + uint64_t flags; + target_ulong pc_start; + target_ulong cs_base; + int num_insns; + int max_insns; +#ifdef VBOX + int const singlestep = env->state & CPU_EMULATE_SINGLE_STEP; +#endif + + /* generate intermediate code */ + pc_start = tb->pc; + cs_base = tb->cs_base; + flags = tb->flags; + + dc->pe = (flags >> HF_PE_SHIFT) & 1; + dc->code32 = (flags >> HF_CS32_SHIFT) & 1; + dc->ss32 = (flags >> HF_SS32_SHIFT) & 1; + dc->addseg = (flags >> HF_ADDSEG_SHIFT) & 1; + dc->f_st = 0; + dc->vm86 = (flags >> VM_SHIFT) & 1; +#ifdef VBOX + dc->vme = !!(env->cr[4] & CR4_VME_MASK); + dc->pvi = !!(env->cr[4] & CR4_PVI_MASK); +# ifdef VBOX_WITH_CALL_RECORD + if ( !(env->state & CPU_RAW_RING0) + && (env->cr[0] & CR0_PG_MASK) + && !(env->eflags & X86_EFL_IF) + && dc->code32) + dc->record_call = 1; + else + dc->record_call = 0; +# endif +#endif /* VBOX */ + dc->cpl = (flags >> HF_CPL_SHIFT) & 3; + dc->iopl = (flags >> IOPL_SHIFT) & 3; + dc->tf = (flags >> TF_SHIFT) & 1; + dc->singlestep_enabled = env->singlestep_enabled; + dc->cc_op = CC_OP_DYNAMIC; + dc->cs_base = cs_base; + dc->tb = tb; + dc->popl_esp_hack = 0; + /* select memory access functions */ + dc->mem_index = 0; + if (flags & HF_SOFTMMU_MASK) { + if (dc->cpl == 3) + dc->mem_index = 2 * 4; + else + dc->mem_index = 1 * 4; + } + dc->cpuid_features = env->cpuid_features; + dc->cpuid_ext_features = env->cpuid_ext_features; + dc->cpuid_ext2_features = env->cpuid_ext2_features; + dc->cpuid_ext3_features = env->cpuid_ext3_features; +#ifdef TARGET_X86_64 + dc->lma = (flags >> HF_LMA_SHIFT) & 1; + dc->code64 = (flags >> HF_CS64_SHIFT) & 1; +#endif + dc->flags = flags; + dc->jmp_opt = !(dc->tf || env->singlestep_enabled || + (flags & HF_INHIBIT_IRQ_MASK) +#ifndef CONFIG_SOFTMMU + || (flags & HF_SOFTMMU_MASK) +#endif + ); +#if 0 + /* check addseg logic */ + if (!dc->addseg && (dc->vm86 || !dc->pe || !dc->code32)) + printf("ERROR addseg\n"); +#endif + + cpu_T[0] = tcg_temp_new(); + cpu_T[1] = tcg_temp_new(); + cpu_A0 = tcg_temp_new(); + cpu_T3 = tcg_temp_new(); + + cpu_tmp0 = tcg_temp_new(); + cpu_tmp1_i64 = tcg_temp_new_i64(); + cpu_tmp2_i32 = tcg_temp_new_i32(); + cpu_tmp3_i32 = tcg_temp_new_i32(); + cpu_tmp4 = tcg_temp_new(); + cpu_tmp5 = tcg_temp_new(); + cpu_ptr0 = tcg_temp_new_ptr(); + cpu_ptr1 = tcg_temp_new_ptr(); + + gen_opc_end = gen_opc_buf + OPC_MAX_SIZE; + + dc->is_jmp = DISAS_NEXT; + pc_ptr = pc_start; + lj = -1; + num_insns = 0; + max_insns = tb->cflags & CF_COUNT_MASK; + if (max_insns == 0) + max_insns = CF_COUNT_MASK; + + gen_icount_start(); + for(;;) { + if (unlikely(!QTAILQ_EMPTY(&env->breakpoints))) { + QTAILQ_FOREACH(bp, &env->breakpoints, entry) { + if (bp->pc == pc_ptr && + !((bp->flags & BP_CPU) && (tb->flags & HF_RF_MASK))) { + gen_debug(dc, pc_ptr - dc->cs_base); + break; + } + } + } + if (search_pc) { + j = gen_opc_ptr - gen_opc_buf; + if (lj < j) { + lj++; + while (lj < j) + gen_opc_instr_start[lj++] = 0; + } + gen_opc_pc[lj] = pc_ptr; + gen_opc_cc_op[lj] = dc->cc_op; + gen_opc_instr_start[lj] = 1; + gen_opc_icount[lj] = num_insns; + } + if (num_insns + 1 == max_insns && (tb->cflags & CF_LAST_IO)) + gen_io_start(); + + pc_ptr = disas_insn(dc, pc_ptr); + num_insns++; + /* stop translation if indicated */ + if (dc->is_jmp) + 
break; +#ifdef VBOX +# ifdef DEBUG +/* + if(cpu_check_code_raw(env, pc_ptr, env->hflags | (env->eflags & (IOPL_MASK | TF_MASK | VM_MASK))) == ERROR_SUCCESS) + { + //should never happen as the jump to the patch code terminates the translation block + dprintf(("QEmu is about to execute instructions in our patch block at %08X!!\n", pc_ptr)); + } +*/ +# endif /* DEBUG */ + if (env->state & CPU_EMULATE_SINGLE_INSTR) + { + env->state &= ~CPU_EMULATE_SINGLE_INSTR; + gen_jmp_im(pc_ptr - dc->cs_base); + gen_eob(dc); + break; + } +#endif /* VBOX */ + + /* if single step mode, we generate only one instruction and + generate an exception */ + /* if irq were inhibited with HF_INHIBIT_IRQ_MASK, we clear + the flag and abort the translation to give the irqs a + change to be happen */ + if (dc->tf || dc->singlestep_enabled || + (flags & HF_INHIBIT_IRQ_MASK)) { + gen_jmp_im(pc_ptr - dc->cs_base); + gen_eob(dc); + break; + } + /* if too long translation, stop generation too */ + if (gen_opc_ptr >= gen_opc_end || + (pc_ptr - pc_start) >= (TARGET_PAGE_SIZE - 32) || + num_insns >= max_insns) { + gen_jmp_im(pc_ptr - dc->cs_base); + gen_eob(dc); + break; + } + if (singlestep) { + gen_jmp_im(pc_ptr - dc->cs_base); + gen_eob(dc); + break; + } + } + if (tb->cflags & CF_LAST_IO) + gen_io_end(); + gen_icount_end(tb, num_insns); + *gen_opc_ptr = INDEX_op_end; + /* we don't forget to fill the last values */ + if (search_pc) { + j = gen_opc_ptr - gen_opc_buf; + lj++; + while (lj <= j) + gen_opc_instr_start[lj++] = 0; + } + +#ifdef DEBUG_DISAS + if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) { + int disas_flags; + qemu_log("----------------\n"); + qemu_log("IN: %s\n", lookup_symbol(pc_start)); +#ifdef TARGET_X86_64 + if (dc->code64) + disas_flags = 2; + else +#endif + disas_flags = !dc->code32; + log_target_disas(pc_start, pc_ptr - pc_start, disas_flags); + qemu_log("\n"); + } +#endif + + if (!search_pc) { + tb->size = pc_ptr - pc_start; + tb->icount = num_insns; + } +} + +void gen_intermediate_code(CPUState *env, TranslationBlock *tb) +{ + gen_intermediate_code_internal(env, tb, 0); +} + +void gen_intermediate_code_pc(CPUState *env, TranslationBlock *tb) +{ + gen_intermediate_code_internal(env, tb, 1); +} + +void gen_pc_load(CPUState *env, TranslationBlock *tb, + uintptr_t searched_pc, int pc_pos, void *puc) +{ + int cc_op; +#ifdef DEBUG_DISAS + if (qemu_loglevel_mask(CPU_LOG_TB_OP)) { + int i; + qemu_log("RESTORE:\n"); + for(i = 0;i <= pc_pos; i++) { + if (gen_opc_instr_start[i]) { + qemu_log("0x%04x: " TARGET_FMT_lx "\n", i, gen_opc_pc[i]); + } + } + qemu_log("spc=0x%08lx pc_pos=0x%x eip=" TARGET_FMT_lx " cs_base=%x\n", + searched_pc, pc_pos, gen_opc_pc[pc_pos] - tb->cs_base, + (uint32_t)tb->cs_base); + } +#endif + env->eip = gen_opc_pc[pc_pos] - tb->cs_base; + cc_op = gen_opc_cc_op[pc_pos]; + if (cc_op != CC_OP_DYNAMIC) + env->cc_op = cc_op; +} diff --git a/src/recompiler/targphys.h b/src/recompiler/targphys.h new file mode 100644 index 00000000..95648d68 --- /dev/null +++ b/src/recompiler/targphys.h @@ -0,0 +1,21 @@ +/* Define target_phys_addr_t if it exists. */ + +#ifndef TARGPHYS_H +#define TARGPHYS_H + +#ifdef TARGET_PHYS_ADDR_BITS +/* target_phys_addr_t is the type of a physical address (its size can + be different from 'target_ulong'). 
*/ + +#if TARGET_PHYS_ADDR_BITS == 32 +typedef uint32_t target_phys_addr_t; +#define TARGET_PHYS_ADDR_MAX UINT32_MAX +#define TARGET_FMT_plx "%08x" +#elif TARGET_PHYS_ADDR_BITS == 64 +typedef uint64_t target_phys_addr_t; +#define TARGET_PHYS_ADDR_MAX UINT64_MAX +#define TARGET_FMT_plx "%016" PRIx64 +#endif +#endif + +#endif diff --git a/src/recompiler/tcg-runtime.c b/src/recompiler/tcg-runtime.c new file mode 100644 index 00000000..ab2df5f0 --- /dev/null +++ b/src/recompiler/tcg-runtime.c @@ -0,0 +1,89 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef VBOX +#include <stdint.h> +#else +# include <VBox/types.h> +#endif + +#include "tcg/tcg-runtime.h" + +/* 32-bit helpers */ + +int32_t tcg_helper_div_i32(int32_t arg1, int32_t arg2) +{ + return arg1 / arg2; +} + +int32_t tcg_helper_rem_i32(int32_t arg1, int32_t arg2) +{ + return arg1 % arg2; +} + +uint32_t tcg_helper_divu_i32(uint32_t arg1, uint32_t arg2) +{ + return arg1 / arg2; +} + +uint32_t tcg_helper_remu_i32(uint32_t arg1, uint32_t arg2) +{ + return arg1 % arg2; +} + +/* 64-bit helpers */ + +int64_t tcg_helper_shl_i64(int64_t arg1, int64_t arg2) +{ + return arg1 << arg2; +} + +int64_t tcg_helper_shr_i64(int64_t arg1, int64_t arg2) +{ + return (uint64_t)arg1 >> arg2; +} + +int64_t tcg_helper_sar_i64(int64_t arg1, int64_t arg2) +{ + return arg1 >> arg2; +} + +int64_t tcg_helper_div_i64(int64_t arg1, int64_t arg2) +{ + return arg1 / arg2; +} + +int64_t tcg_helper_rem_i64(int64_t arg1, int64_t arg2) +{ + return arg1 % arg2; +} + +uint64_t tcg_helper_divu_i64(uint64_t arg1, uint64_t arg2) +{ + return arg1 / arg2; +} + +uint64_t tcg_helper_remu_i64(uint64_t arg1, uint64_t arg2) +{ + return arg1 % arg2; +} diff --git a/src/recompiler/tcg/LICENSE b/src/recompiler/tcg/LICENSE new file mode 100644 index 00000000..be817fa1 --- /dev/null +++ b/src/recompiler/tcg/LICENSE @@ -0,0 +1,3 @@ +All the files in this directory and subdirectories are released under +a BSD like license (see header in each file). No other license is +accepted. diff --git a/src/recompiler/tcg/Makefile.kup b/src/recompiler/tcg/Makefile.kup new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/recompiler/tcg/Makefile.kup diff --git a/src/recompiler/tcg/README b/src/recompiler/tcg/README new file mode 100644 index 00000000..8d6fe059 --- /dev/null +++ b/src/recompiler/tcg/README @@ -0,0 +1,497 @@ +Tiny Code Generator - Fabrice Bellard. 
+
+1) Introduction
+
+TCG (Tiny Code Generator) began as a generic backend for a C
+compiler. It was simplified to be used in QEMU. It also has its roots
+in the QOP code generator written by Paul Brook.
+
+2) Definitions
+
+The TCG "target" is the architecture for which we generate the
+code. It is of course not the same as the "target" of QEMU, which is
+the emulated architecture. As TCG started as a generic C backend used
+for cross compiling, it is assumed that the TCG target is different
+from the host, although this is never the case for QEMU.
+
+A TCG "function" corresponds to a QEMU Translated Block (TB).
+
+A TCG "temporary" is a variable that is only live in a basic
+block. Temporaries are allocated explicitly in each function.
+
+A TCG "local temporary" is a variable that is only live in a function.
+Local temporaries are allocated explicitly in each function.
+
+A TCG "global" is a variable which is live in all the functions
+(equivalent of a C global variable). They are defined before the
+functions are defined. A TCG global can be a memory location (e.g. a
+QEMU CPU register), a fixed host register (e.g. the QEMU CPU state
+pointer) or a memory location which is stored in a register outside
+QEMU TBs (not implemented yet).
+
+A TCG "basic block" corresponds to a list of instructions terminated
+by a branch instruction.
+
+3) Intermediate representation
+
+3.1) Introduction
+
+TCG instructions operate on variables which are temporaries, local
+temporaries or globals. TCG instructions and variables are strongly
+typed. Two types are supported: 32 bit integers and 64 bit
+integers. Pointers are defined as an alias to 32 bit or 64 bit
+integers depending on the TCG target word size.
+
+Each instruction has a fixed number of output variable operands, input
+variable operands and always constant operands.
+
+The notable exception is the call instruction which has a variable
+number of outputs and inputs.
+
+In the textual form, output operands usually come first, followed by
+input operands, followed by constant operands. The output type is
+included in the instruction name. Constants are prefixed with a '$'.
+
+add_i32 t0, t1, t2  (t0 <- t1 + t2)
+
+3.2) Assumptions
+
+* Basic blocks
+
+- Basic blocks end after branches (e.g. brcond_i32 instruction),
+  goto_tb and exit_tb instructions.
+- Basic blocks start after the end of a previous basic block, or at a
+  set_label instruction.
+
+After the end of a basic block, the content of temporaries is
+destroyed, but local temporaries and globals are preserved.
+
+* Floating point types are not supported yet
+
+* Pointers: depending on the TCG target, pointer size is 32 bit or 64
+  bit. The type TCG_TYPE_PTR is an alias to TCG_TYPE_I32 or
+  TCG_TYPE_I64.
+
+* Helpers:
+
+Using the tcg_gen_helper_x_y functions it is possible to call any
+function taking i32, i64 or pointer types. By default, before calling
+a helper, all globals are stored at their canonical location and it is
+assumed that the function can modify them. This can be overridden by
+the TCG_CALL_CONST function modifier. By default, the helper is
+allowed to modify the CPU state or raise an exception. This can be
+overridden by the TCG_CALL_PURE function modifier, in which case the
+call to the function is removed if the return value is not used.
+
+On some TCG targets (e.g. x86), several calling conventions are
+supported.
+
+* Branches:
+
+Use the instruction 'br' to jump to a label. Use 'jmp' to jump to an
+explicit address. Conditional branches can only jump to labels.
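+
+As an illustration, the brcond/set_label pattern described above is
+what a front end emits through its tcg_gen_* wrappers. The following
+sketch uses the target-long helpers that the x86 front end elsewhere
+in this tree (target-i386/translate.c) uses; the names are
+illustrative, and cpu_T[0] stands for whatever value the front end
+wants to test:
+
+    int label1;
+    TCGv t0;
+
+    t0 = tcg_temp_local_new();       /* must survive past the branch */
+    label1 = gen_new_label();
+    tcg_gen_movi_tl(t0, 0);
+    /* the conditional branch terminates the current basic block */
+    tcg_gen_brcondi_tl(TCG_COND_NE, cpu_T[0], 0, label1);
+    tcg_gen_movi_tl(t0, 1);          /* reached only when cpu_T[0] == 0 */
+    gen_set_label(label1);           /* a new basic block starts here */
+    /* ... use t0 ... */
+    tcg_temp_free(t0);
+
+Because t0 is written before the branch and read after set_label, it
+must be a local temporary; a plain temporary would not be preserved
+across the basic block boundary.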
+ +3.3) Code Optimizations + +When generating instructions, you can count on at least the following +optimizations: + +- Single instructions are simplified, e.g. + + and_i32 t0, t0, $0xffffffff + + is suppressed. + +- A liveness analysis is done at the basic block level. The + information is used to suppress moves from a dead variable to + another one. It is also used to remove instructions which compute + dead results. The later is especially useful for condition code + optimization in QEMU. + + In the following example: + + add_i32 t0, t1, t2 + add_i32 t0, t0, $1 + mov_i32 t0, $1 + + only the last instruction is kept. + +3.4) Instruction Reference + +********* Function call + +* call <ret> <params> ptr + +call function 'ptr' (pointer type) + +<ret> optional 32 bit or 64 bit return value +<params> optional 32 bit or 64 bit parameters + +********* Jumps/Labels + +* jmp t0 + +Absolute jump to address t0 (pointer type). + +* set_label $label + +Define label 'label' at the current program point. + +* br $label + +Jump to label. + +* brcond_i32/i64 cond, t0, t1, label + +Conditional jump if t0 cond t1 is true. cond can be: + TCG_COND_EQ + TCG_COND_NE + TCG_COND_LT /* signed */ + TCG_COND_GE /* signed */ + TCG_COND_LE /* signed */ + TCG_COND_GT /* signed */ + TCG_COND_LTU /* unsigned */ + TCG_COND_GEU /* unsigned */ + TCG_COND_LEU /* unsigned */ + TCG_COND_GTU /* unsigned */ + +********* Arithmetic + +* add_i32/i64 t0, t1, t2 + +t0=t1+t2 + +* sub_i32/i64 t0, t1, t2 + +t0=t1-t2 + +* neg_i32/i64 t0, t1 + +t0=-t1 (two's complement) + +* mul_i32/i64 t0, t1, t2 + +t0=t1*t2 + +* div_i32/i64 t0, t1, t2 + +t0=t1/t2 (signed). Undefined behavior if division by zero or overflow. + +* divu_i32/i64 t0, t1, t2 + +t0=t1/t2 (unsigned). Undefined behavior if division by zero. + +* rem_i32/i64 t0, t1, t2 + +t0=t1%t2 (signed). Undefined behavior if division by zero or overflow. + +* remu_i32/i64 t0, t1, t2 + +t0=t1%t2 (unsigned). Undefined behavior if division by zero. + +********* Logical + +* and_i32/i64 t0, t1, t2 + +t0=t1&t2 + +* or_i32/i64 t0, t1, t2 + +t0=t1|t2 + +* xor_i32/i64 t0, t1, t2 + +t0=t1^t2 + +* not_i32/i64 t0, t1 + +t0=~t1 + +* andc_i32/i64 t0, t1, t2 + +t0=t1&~t2 + +* eqv_i32/i64 t0, t1, t2 + +t0=~(t1^t2), or equivalently, t0=t1^~t2 + +* nand_i32/i64 t0, t1, t2 + +t0=~(t1&t2) + +* nor_i32/i64 t0, t1, t2 + +t0=~(t1|t2) + +* orc_i32/i64 t0, t1, t2 + +t0=t1|~t2 + +********* Shifts/Rotates + +* shl_i32/i64 t0, t1, t2 + +t0=t1 << t2. Undefined behavior if t2 < 0 or t2 >= 32 (resp 64) + +* shr_i32/i64 t0, t1, t2 + +t0=t1 >> t2 (unsigned). Undefined behavior if t2 < 0 or t2 >= 32 (resp 64) + +* sar_i32/i64 t0, t1, t2 + +t0=t1 >> t2 (signed). Undefined behavior if t2 < 0 or t2 >= 32 (resp 64) + +* rotl_i32/i64 t0, t1, t2 + +Rotation of t2 bits to the left. Undefined behavior if t2 < 0 or t2 >= 32 (resp 64) + +* rotr_i32/i64 t0, t1, t2 + +Rotation of t2 bits to the right. Undefined behavior if t2 < 0 or t2 >= 32 (resp 64) + +********* Misc + +* mov_i32/i64 t0, t1 + +t0 = t1 + +Move t1 to t0 (both operands must have the same type). + +* ext8s_i32/i64 t0, t1 +ext8u_i32/i64 t0, t1 +ext16s_i32/i64 t0, t1 +ext16u_i32/i64 t0, t1 +ext32s_i64 t0, t1 +ext32u_i64 t0, t1 + +8, 16 or 32 bit sign/zero extension (both operands must have the same type) + +* bswap16_i32/i64 t0, t1 + +16 bit byte swap on a 32/64 bit value. It assumes that the two/six high order +bytes are set to zero. + +* bswap32_i32/i64 t0, t1 + +32 bit byte swap on a 32/64 bit value. 
With a 64 bit value, it assumes that +the four high order bytes are set to zero. + +* bswap64_i64 t0, t1 + +64 bit byte swap + +* discard_i32/i64 t0 + +Indicate that the value of t0 won't be used later. It is useful to +force dead code elimination. + +********* Conditional moves + +* setcond_i32/i64 cond, dest, t1, t2 + +dest = (t1 cond t2) + +Set DEST to 1 if (T1 cond T2) is true, otherwise set to 0. + +********* Type conversions + +* ext_i32_i64 t0, t1 +Convert t1 (32 bit) to t0 (64 bit) and does sign extension + +* extu_i32_i64 t0, t1 +Convert t1 (32 bit) to t0 (64 bit) and does zero extension + +* trunc_i64_i32 t0, t1 +Truncate t1 (64 bit) to t0 (32 bit) + +* concat_i32_i64 t0, t1, t2 +Construct t0 (64-bit) taking the low half from t1 (32 bit) and the high half +from t2 (32 bit). + +* concat32_i64 t0, t1, t2 +Construct t0 (64-bit) taking the low half from t1 (64 bit) and the high half +from t2 (64 bit). + +********* Load/Store + +* ld_i32/i64 t0, t1, offset +ld8s_i32/i64 t0, t1, offset +ld8u_i32/i64 t0, t1, offset +ld16s_i32/i64 t0, t1, offset +ld16u_i32/i64 t0, t1, offset +ld32s_i64 t0, t1, offset +ld32u_i64 t0, t1, offset + +t0 = read(t1 + offset) +Load 8, 16, 32 or 64 bits with or without sign extension from host memory. +offset must be a constant. + +* st_i32/i64 t0, t1, offset +st8_i32/i64 t0, t1, offset +st16_i32/i64 t0, t1, offset +st32_i64 t0, t1, offset + +write(t0, t1 + offset) +Write 8, 16, 32 or 64 bits to host memory. + +********* 64-bit target on 32-bit host support + +The following opcodes are internal to TCG. Thus they are to be implemented by +32-bit host code generators, but are not to be emitted by guest translators. +They are emitted as needed by inline functions within "tcg-op.h". + +* brcond2_i32 cond, t0_low, t0_high, t1_low, t1_high, label + +Similar to brcond, except that the 64-bit values T0 and T1 +are formed from two 32-bit arguments. + +* add2_i32 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high +* sub2_i32 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high + +Similar to add/sub, except that the 64-bit inputs T1 and T2 are +formed from two 32-bit arguments, and the 64-bit output T0 +is returned in two 32-bit outputs. + +* mulu2_i32 t0_low, t0_high, t1, t2 + +Similar to mul, except two 32-bit (unsigned) inputs T1 and T2 yielding +the full 64-bit product T0. The later is returned in two 32-bit outputs. + +* setcond2_i32 cond, dest, t1_low, t1_high, t2_low, t2_high + +Similar to setcond, except that the 64-bit values T1 and T2 are +formed from two 32-bit arguments. The result is a 32-bit value. + +********* QEMU specific operations + +* tb_exit t0 + +Exit the current TB and return the value t0 (word type). + +* goto_tb index + +Exit the current TB and jump to the TB index 'index' (constant) if the +current TB was linked to this TB. Otherwise execute the next +instructions. + +* qemu_ld8u t0, t1, flags +qemu_ld8s t0, t1, flags +qemu_ld16u t0, t1, flags +qemu_ld16s t0, t1, flags +qemu_ld32 t0, t1, flags +qemu_ld32u t0, t1, flags +qemu_ld32s t0, t1, flags +qemu_ld64 t0, t1, flags + +Load data at the QEMU CPU address t1 into t0. t1 has the QEMU CPU address +type. 'flags' contains the QEMU memory index (selects user or kernel access) +for example. + +Note that "qemu_ld32" implies a 32-bit result, while "qemu_ld32u" and +"qemu_ld32s" imply a 64-bit result appropriately extended from 32 bits. + +* qemu_st8 t0, t1, flags +qemu_st16 t0, t1, flags +qemu_st32 t0, t1, flags +qemu_st64 t0, t1, flags + +Store the data t0 at the QEMU CPU Address t1. 
t1 has the QEMU CPU +address type. 'flags' contains the QEMU memory index (selects user or +kernel access) for example. + +Note 1: Some shortcuts are defined when the last operand is known to be +a constant (e.g. addi for add, movi for mov). + +Note 2: When using TCG, the opcodes must never be generated directly +as some of them may not be available as "real" opcodes. Always use the +function tcg_gen_xxx(args). + +4) Backend + +tcg-target.h contains the target specific definitions. tcg-target.c +contains the target specific code. + +4.1) Assumptions + +The target word size (TCG_TARGET_REG_BITS) is expected to be 32 bit or +64 bit. It is expected that the pointer has the same size as the word. + +On a 32 bit target, all 64 bit operations are converted to 32 bits. A +few specific operations must be implemented to allow it (see add2_i32, +sub2_i32, brcond2_i32). + +Floating point operations are not supported in this version. A +previous incarnation of the code generator had full support of them, +but it is better to concentrate on integer operations first. + +On a 64 bit target, no assumption is made in TCG about the storage of +the 32 bit values in 64 bit registers. + +4.2) Constraints + +GCC like constraints are used to define the constraints of every +instruction. Memory constraints are not supported in this +version. Aliases are specified in the input operands as for GCC. + +The same register may be used for both an input and an output, even when +they are not explicitly aliased. If an op expands to multiple target +instructions then care must be taken to avoid clobbering input values. +GCC style "early clobber" outputs are not currently supported. + +A target can define specific register or constant constraints. If an +operation uses a constant input constraint which does not allow all +constants, it must also accept registers in order to have a fallback. + +The movi_i32 and movi_i64 operations must accept any constants. + +The mov_i32 and mov_i64 operations must accept any registers of the +same type. + +The ld/st instructions must accept signed 32 bit constant offsets. It +can be implemented by reserving a specific register to compute the +address if the offset is too big. + +The ld/st instructions must accept any destination (ld) or source (st) +register. + +4.3) Function call assumptions + +- The only supported types for parameters and return value are: 32 and + 64 bit integers and pointer. +- The stack grows downwards. +- The first N parameters are passed in registers. +- The next parameters are passed on the stack by storing them as words. +- Some registers are clobbered during the call. +- The function can return 0 or 1 value in registers. On a 32 bit + target, functions must be able to return 2 values in registers for + 64 bit return type. + +5) Recommended coding rules for best performance + +- Use globals to represent the parts of the QEMU CPU state which are + often modified, e.g. the integer registers and the condition + codes. TCG will be able to use host registers to store them. + +- Avoid globals stored in fixed registers. They must be used only to + store the pointer to the CPU state and possibly to store a pointer + to a register window. + +- Use temporaries. Use local temporaries only when really needed, + e.g. when you need to use a value after a jump. Local temporaries + introduce a performance hit in the current TCG implementation: their + content is saved to memory at end of each basic block. 
+
+- Free temporaries and local temporaries when they are no longer used
+  (tcg_temp_free). Since tcg_const_x() also creates a temporary, you
+  should free it after it is used. Freeing temporaries does not yield
+  better generated code, but it reduces the memory usage of TCG and
+  speeds up the translation.
+
+- Don't hesitate to use helpers for complicated or seldom used target
+  instructions. There is little performance advantage in using TCG to
+  implement target instructions taking more than about twenty TCG
+  instructions.
+
+- Use the 'discard' instruction if you know that TCG won't be able to
+  prove that a given global is "dead" at a given program point. The
+  x86 target uses it to improve the condition codes optimisation.
diff --git a/src/recompiler/tcg/TODO b/src/recompiler/tcg/TODO
new file mode 100644
index 00000000..2bc2785d
--- /dev/null
+++ b/src/recompiler/tcg/TODO
@@ -0,0 +1,14 @@
+- Add new instructions such as: clz, ctz, popcnt.
+
+- See if it is worth exporting mul2, mulu2, div2, divu2.
+
+- Support of globals saved in fixed registers between TBs.
+
+Ideas:
+
+- Move the slow part of the qemu_ld/st ops after the end of the TB.
+
+- Change exception syntax to get closer to QOP system (exception
+  parameters given with a specific instruction).
+
+- Add float and vector support.
diff --git a/src/recompiler/tcg/i386/Makefile.kup b/src/recompiler/tcg/i386/Makefile.kup
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/recompiler/tcg/i386/Makefile.kup
diff --git a/src/recompiler/tcg/i386/tcg-target.c b/src/recompiler/tcg/i386/tcg-target.c
new file mode 100644
index 00000000..0943d54e
--- /dev/null
+++ b/src/recompiler/tcg/i386/tcg-target.c
@@ -0,0 +1,2231 @@
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */ + +#ifndef NDEBUG +static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { +#if TCG_TARGET_REG_BITS == 64 + "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", + "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", +#else + "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", +#endif +}; +#endif + +static const int tcg_target_reg_alloc_order[] = { +#if TCG_TARGET_REG_BITS == 64 + TCG_REG_RBP, + TCG_REG_RBX, + TCG_REG_R12, + TCG_REG_R13, + TCG_REG_R14, + TCG_REG_R15, + TCG_REG_R10, + TCG_REG_R11, +# if !defined(VBOX) || !defined(__MINGW64__) + TCG_REG_R9, + TCG_REG_R8, + TCG_REG_RCX, + TCG_REG_RDX, +# endif + TCG_REG_RSI, + TCG_REG_RDI, +# if defined(VBOX) && defined(__MINGW64__) + TCG_REG_R9, + TCG_REG_R8, + TCG_REG_RDX, + TCG_REG_RCX, +# endif + TCG_REG_RAX, +#else + TCG_REG_EBX, + TCG_REG_ESI, + TCG_REG_EDI, + TCG_REG_EBP, + TCG_REG_ECX, + TCG_REG_EDX, + TCG_REG_EAX, +#endif +}; + +static const int tcg_target_call_iarg_regs[] = { +#if TCG_TARGET_REG_BITS == 64 +# if defined(VBOX) && defined(__MINGW64__) + TCG_REG_RCX, + TCG_REG_RDX, +# else + TCG_REG_RDI, + TCG_REG_RSI, + TCG_REG_RDX, + TCG_REG_RCX, +# endif + TCG_REG_R8, + TCG_REG_R9, +#else + TCG_REG_EAX, + TCG_REG_EDX, + TCG_REG_ECX +#endif +}; + +static const int tcg_target_call_oarg_regs[2] = { + TCG_REG_EAX, + TCG_REG_EDX +}; + +static uint8_t *tb_ret_addr; + +static void patch_reloc(uint8_t *code_ptr, int type, + tcg_target_long value, tcg_target_long addend) +{ + value += addend; + switch(type) { + case R_386_PC32: + value -= (uintptr_t)code_ptr; + if (value != (int32_t)value) { + tcg_abort(); + } + *(uint32_t *)code_ptr = value; + break; + case R_386_PC8: + value -= (uintptr_t)code_ptr; + if (value != (int8_t)value) { + tcg_abort(); + } + *(uint8_t *)code_ptr = value; + break; + default: + tcg_abort(); + } +} + +#ifdef VBOX +/* emits stack alignment checks for strict builds. */ +DECLINLINE(void) tcg_gen_stack_alignment_check(TCGContext *s) +{ +# if defined(RT_STRICT) && defined(RT_OS_DARWIN) /** @todo all OSes? 
*/ + tcg_out8(s, 0xf7); tcg_out8(s, 0xc4); /* test %esp, 1fh */ + tcg_out32(s, TCG_TARGET_STACK_ALIGN - 1); + tcg_out8(s, 0x74); /* jz imm8 */ + tcg_out8(s, 1); /* $+3 (over int3) */ + tcg_out8(s, 0xcc); /* int3 */ +# else + NOREF(s); +# endif +} +#endif /* VBOX */ + +/* maximum number of register used for input function arguments */ +static inline int tcg_target_get_call_iarg_regs_count(int flags) +{ + if (TCG_TARGET_REG_BITS == 64) { + return 6; + } + + flags &= TCG_CALL_TYPE_MASK; + switch(flags) { + case TCG_CALL_TYPE_STD: + return 0; + case TCG_CALL_TYPE_REGPARM_1: + case TCG_CALL_TYPE_REGPARM_2: + case TCG_CALL_TYPE_REGPARM: + return flags - TCG_CALL_TYPE_REGPARM_1 + 1; + default: + tcg_abort(); + } +} + +/* parse target specific constraints */ +static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) +{ + const char *ct_str; + + ct_str = *pct_str; + switch(ct_str[0]) { + case 'a': + ct->ct |= TCG_CT_REG; + tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX); + break; + case 'b': + ct->ct |= TCG_CT_REG; + tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX); + break; + case 'c': + ct->ct |= TCG_CT_REG; + tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX); + break; + case 'd': + ct->ct |= TCG_CT_REG; + tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX); + break; + case 'S': + ct->ct |= TCG_CT_REG; + tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI); + break; + case 'D': + ct->ct |= TCG_CT_REG; + tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI); + break; + case 'q': + ct->ct |= TCG_CT_REG; + if (TCG_TARGET_REG_BITS == 64) { + tcg_regset_set32(ct->u.regs, 0, 0xffff); + } else { + tcg_regset_set32(ct->u.regs, 0, 0xf); + } + break; + case 'r': + ct->ct |= TCG_CT_REG; + if (TCG_TARGET_REG_BITS == 64) { + tcg_regset_set32(ct->u.regs, 0, 0xffff); + } else { + tcg_regset_set32(ct->u.regs, 0, 0xff); + } + break; + + /* qemu_ld/st address constraint */ + case 'L': + ct->ct |= TCG_CT_REG; + if (TCG_TARGET_REG_BITS == 64) { + tcg_regset_set32(ct->u.regs, 0, 0xffff); +#if defined(VBOX) && defined(__MINGW64__) + tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[2]); + tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[1]); + tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[0]); +#else + /** @todo figure why RDX isn't mentioned here. 
*/ + tcg_regset_reset_reg(ct->u.regs, TCG_REG_RSI); + tcg_regset_reset_reg(ct->u.regs, TCG_REG_RDI); +#endif + } else { + tcg_regset_set32(ct->u.regs, 0, 0xff); + tcg_regset_reset_reg(ct->u.regs, TCG_REG_EAX); + tcg_regset_reset_reg(ct->u.regs, TCG_REG_EDX); + } + break; + + case 'e': + ct->ct |= TCG_CT_CONST_S32; + break; + case 'Z': + ct->ct |= TCG_CT_CONST_U32; + break; + + default: + return -1; + } + ct_str++; + *pct_str = ct_str; + return 0; +} + +/* test if a constant matches the constraint */ +static inline int tcg_target_const_match(tcg_target_long val, + const TCGArgConstraint *arg_ct) +{ + int ct = arg_ct->ct; + if (ct & TCG_CT_CONST) { + return 1; + } + if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) { + return 1; + } + if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { + return 1; + } + return 0; +} + +#if TCG_TARGET_REG_BITS == 64 +# define LOWREGMASK(x) ((x) & 7) +#else +# define LOWREGMASK(x) (x) +#endif + +#define P_EXT 0x100 /* 0x0f opcode prefix */ +#define P_DATA16 0x200 /* 0x66 opcode prefix */ +#if TCG_TARGET_REG_BITS == 64 +# define P_ADDR32 0x400 /* 0x67 opcode prefix */ +# define P_REXW 0x800 /* Set REX.W = 1 */ +# define P_REXB_R 0x1000 /* REG field as byte register */ +# define P_REXB_RM 0x2000 /* R/M field as byte register */ +#else +# define P_ADDR32 0 +# define P_REXW 0 +# define P_REXB_R 0 +# define P_REXB_RM 0 +#endif + +#define OPC_ARITH_EvIz (0x81) +#define OPC_ARITH_EvIb (0x83) +#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ +#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) +#define OPC_BSWAP (0xc8 | P_EXT) +#define OPC_CALL_Jz (0xe8) +#define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3)) +#define OPC_DEC_r32 (0x48) +#define OPC_IMUL_GvEv (0xaf | P_EXT) +#define OPC_IMUL_GvEvIb (0x6b) +#define OPC_IMUL_GvEvIz (0x69) +#define OPC_INC_r32 (0x40) +#define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */ +#define OPC_JCC_short (0x70) /* ... plus condition code */ +#define OPC_JMP_long (0xe9) +#define OPC_JMP_short (0xeb) +#define OPC_LEA (0x8d) +#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ +#define OPC_MOVL_EvGv (0x89) /* stores, more or less */ +#define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ +#define OPC_MOVL_EvIz (0xc7) +#define OPC_MOVL_Iv (0xb8) +#define OPC_MOVSBL (0xbe | P_EXT) +#define OPC_MOVSWL (0xbf | P_EXT) +#define OPC_MOVSLQ (0x63 | P_REXW) +#define OPC_MOVZBL (0xb6 | P_EXT) +#define OPC_MOVZWL (0xb7 | P_EXT) +#define OPC_POP_r32 (0x58) +#define OPC_PUSH_r32 (0x50) +#define OPC_PUSH_Iv (0x68) +#define OPC_PUSH_Ib (0x6a) +#define OPC_RET (0xc3) +#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */ +#define OPC_SHIFT_1 (0xd1) +#define OPC_SHIFT_Ib (0xc1) +#define OPC_SHIFT_cl (0xd3) +#define OPC_TESTL (0x85) +#define OPC_XCHG_ax_r32 (0x90) + +#define OPC_GRP3_Ev (0xf7) +#define OPC_GRP5 (0xff) + +/* Group 1 opcode extensions for 0x80-0x83. + These are also used as modifiers for OPC_ARITH. */ +#define ARITH_ADD 0 +#define ARITH_OR 1 +#define ARITH_ADC 2 +#define ARITH_SBB 3 +#define ARITH_AND 4 +#define ARITH_SUB 5 +#define ARITH_XOR 6 +#define ARITH_CMP 7 + +/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */ +#define SHIFT_ROL 0 +#define SHIFT_ROR 1 +#define SHIFT_SHL 4 +#define SHIFT_SHR 5 +#define SHIFT_SAR 7 + +/* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */ +#define EXT3_NOT 2 +#define EXT3_NEG 3 +#define EXT3_MUL 4 +#define EXT3_IMUL 5 +#define EXT3_DIV 6 +#define EXT3_IDIV 7 + +/* Group 5 opcode extensions for 0xff. 
To be used with OPC_GRP5. */ +#define EXT5_INC_Ev 0 +#define EXT5_DEC_Ev 1 +#define EXT5_CALLN_Ev 2 +#define EXT5_JMPN_Ev 4 + +/* Condition codes to be added to OPC_JCC_{long,short}. */ +#define JCC_JMP (-1) +#define JCC_JO 0x0 +#define JCC_JNO 0x1 +#define JCC_JB 0x2 +#define JCC_JAE 0x3 +#define JCC_JE 0x4 +#define JCC_JNE 0x5 +#define JCC_JBE 0x6 +#define JCC_JA 0x7 +#define JCC_JS 0x8 +#define JCC_JNS 0x9 +#define JCC_JP 0xa +#define JCC_JNP 0xb +#define JCC_JL 0xc +#define JCC_JGE 0xd +#define JCC_JLE 0xe +#define JCC_JG 0xf + +static const uint8_t tcg_cond_to_jcc[10] = { + [TCG_COND_EQ] = JCC_JE, + [TCG_COND_NE] = JCC_JNE, + [TCG_COND_LT] = JCC_JL, + [TCG_COND_GE] = JCC_JGE, + [TCG_COND_LE] = JCC_JLE, + [TCG_COND_GT] = JCC_JG, + [TCG_COND_LTU] = JCC_JB, + [TCG_COND_GEU] = JCC_JAE, + [TCG_COND_LEU] = JCC_JBE, + [TCG_COND_GTU] = JCC_JA, +}; + +#if defined(VBOX) +/* Calc the size of the tcg_out_opc() result. */ +static inline unsigned char tcg_calc_opc_len(TCGContext *s, int opc, int r, int rm, int x) +{ + unsigned char len = 1; +# if TCG_TARGET_REG_BITS == 64 + unsigned rex; + rex = 0; + rex |= (opc & P_REXW) >> 8; /* REX.W */ + rex |= (r & 8) >> 1; /* REX.R */ + rex |= (x & 8) >> 2; /* REX.X */ + rex |= (rm & 8) >> 3; /* REX.B */ + rex |= opc & (r >= 4 ? P_REXB_R : 0); + rex |= opc & (rm >= 4 ? P_REXB_RM : 0); + if (rex) len++; + if (opc & P_ADDR32) len++; +# endif + if (opc & P_DATA16) len++; + if (opc & P_EXT) len++; + + return len; +} +#endif + +#if TCG_TARGET_REG_BITS == 64 +static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) +{ + int rex; + + if (opc & P_DATA16) { + /* We should never be asking for both 16 and 64-bit operation. */ + assert((opc & P_REXW) == 0); + tcg_out8(s, 0x66); + } + if (opc & P_ADDR32) { + tcg_out8(s, 0x67); + } + + rex = 0; + rex |= (opc & P_REXW) >> 8; /* REX.W */ + rex |= (r & 8) >> 1; /* REX.R */ + rex |= (x & 8) >> 2; /* REX.X */ + rex |= (rm & 8) >> 3; /* REX.B */ + + /* P_REXB_{R,RM} indicates that the given register is the low byte. + For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, + as otherwise the encoding indicates %[abcd]h. Note that the values + that are ORed in merely indicate that the REX byte must be present; + those bits get discarded in output. */ + rex |= opc & (r >= 4 ? P_REXB_R : 0); + rex |= opc & (rm >= 4 ? P_REXB_RM : 0); + + if (rex) { + tcg_out8(s, (uint8_t)(rex | 0x40)); + } + + if (opc & P_EXT) { + tcg_out8(s, 0x0f); + } + tcg_out8(s, opc); +} +#else +static void tcg_out_opc(TCGContext *s, int opc) +{ + if (opc & P_DATA16) { + tcg_out8(s, 0x66); + } + if (opc & P_EXT) { + tcg_out8(s, 0x0f); + } + tcg_out8(s, opc); +} +/* Discard the register arguments to tcg_out_opc early, so as not to penalize + the 32-bit compilation paths. This method works with all versions of gcc, + whereas relying on optimization may not be able to exclude them. */ +#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc) +#endif + +static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) +{ + tcg_out_opc(s, opc, r, rm, 0); + tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); +} + +/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. + We handle either RM and INDEX missing with a negative value. In 64-bit + mode for absolute addresses, ~RM is the size of the immediate operand + that will follow the instruction. 
*/ + +static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, + int index, int shift, + tcg_target_long offset) +{ + int mod, len; + + if (index < 0 && rm < 0) { + if (TCG_TARGET_REG_BITS == 64) { + /* Try for a rip-relative addressing mode. This has replaced + the 32-bit-mode absolute addressing encoding. */ +#ifdef VBOX + tcg_target_long pc = (tcg_target_long)s->code_ptr + + tcg_calc_opc_len(s, opc, r, 0, 0) + 1 + 4; +#else + tcg_target_long pc = (tcg_target_long)s->code_ptr + 5 + ~rm; +#endif + tcg_target_long disp = offset - pc; + if (disp == (int32_t)disp) { + tcg_out_opc(s, opc, r, 0, 0); + tcg_out8(s, (LOWREGMASK(r) << 3) | 5); + tcg_out32(s, disp); +#ifdef VBOX + Assert(pc == (tcg_target_long)s->code_ptr); +#endif + return; + } + + /* Try for an absolute address encoding. This requires the + use of the MODRM+SIB encoding and is therefore larger than + rip-relative addressing. */ + if (offset == (int32_t)offset) { + tcg_out_opc(s, opc, r, 0, 0); + tcg_out8(s, (LOWREGMASK(r) << 3) | 4); + tcg_out8(s, (4 << 3) | 5); + tcg_out32(s, offset); + return; + } + + /* ??? The memory isn't directly addressable. */ + tcg_abort(); + } else { + /* Absolute address. */ + tcg_out_opc(s, opc, r, 0, 0); + tcg_out8(s, (r << 3) | 5); + tcg_out32(s, offset); + return; + } + } + + /* Find the length of the immediate addend. Note that the encoding + that would be used for (%ebp) indicates absolute addressing. */ + if (rm < 0) { + mod = 0, len = 4, rm = 5; + } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) { + mod = 0, len = 0; + } else if (offset == (int8_t)offset) { + mod = 0x40, len = 1; + } else { + mod = 0x80, len = 4; + } + + /* Use a single byte MODRM format if possible. Note that the encoding + that would be used for %esp is the escape to the two byte form. */ + if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { + /* Single byte MODRM format. */ + tcg_out_opc(s, opc, r, rm, 0); + tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); + } else { + /* Two byte MODRM+SIB format. */ + + /* Note that the encoding that would place %esp into the index + field indicates no index register. In 64-bit mode, the REX.X + bit counts, so %r12 can be used as the index. */ + if (index < 0) { + index = 4; + } else { + assert(index != TCG_REG_ESP); + } + + tcg_out_opc(s, opc, r, rm, index); + tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); + tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); + } + + if (len == 1) { + tcg_out8(s, offset); + } else if (len == 4) { + tcg_out32(s, offset); + } +} + +/* A simplification of the above with no index or shift. */ +static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, + int rm, tcg_target_long offset) +{ + tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); +} + +/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ +static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) +{ + /* Propagate an opcode prefix, such as P_REXW. */ + int ext = subop & ~0x7; + subop &= 0x7; + + tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); +} + +static inline void tcg_out_mov(TCGContext *s, TCGType type, int ret, int arg) +{ + if (arg != ret) { + int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? 
P_REXW : 0); + tcg_out_modrm(s, opc, ret, arg); + } +} + +static void tcg_out_movi(TCGContext *s, TCGType type, + int ret, tcg_target_long arg) +{ + if (arg == 0) { + tgen_arithr(s, ARITH_XOR, ret, ret); + return; + } else if (arg == (uint32_t)arg || type == TCG_TYPE_I32) { + tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0); + tcg_out32(s, arg); + } else if (arg == (int32_t)arg) { + tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret); + tcg_out32(s, arg); + } else { + tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0); + tcg_out32(s, arg); + tcg_out32(s, arg >> 31 >> 1); + } +} + +static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val) +{ + if (val == (int8_t)val) { + tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0); + tcg_out8(s, val); + } else if (val == (int32_t)val) { + tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0); + tcg_out32(s, val); + } else { + tcg_abort(); + } +} + +static inline void tcg_out_push(TCGContext *s, int reg) +{ + tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0); +} + +static inline void tcg_out_pop(TCGContext *s, int reg) +{ + tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0); +} + +static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret, + int arg1, tcg_target_long arg2) +{ + int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0); + tcg_out_modrm_offset(s, opc, ret, arg1, arg2); +} + +static inline void tcg_out_st(TCGContext *s, TCGType type, int arg, + int arg1, tcg_target_long arg2) +{ + int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0); + tcg_out_modrm_offset(s, opc, arg, arg1, arg2); +} + +static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count) +{ + /* Propagate an opcode prefix, such as P_DATA16. */ + int ext = subopc & ~0x7; + subopc &= 0x7; + + if (count == 1) { + tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg); + } else { + tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg); + tcg_out8(s, count); + } +} + +static inline void tcg_out_bswap32(TCGContext *s, int reg) +{ + tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0); +} + +static inline void tcg_out_rolw_8(TCGContext *s, int reg) +{ + tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8); +} + +static inline void tcg_out_ext8u(TCGContext *s, int dest, int src) +{ + /* movzbl */ + assert(src < 4 || TCG_TARGET_REG_BITS == 64); + tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src); +} + +static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw) +{ + /* movsbl */ + assert(src < 4 || TCG_TARGET_REG_BITS == 64); + tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src); +} + +static inline void tcg_out_ext16u(TCGContext *s, int dest, int src) +{ + /* movzwl */ + tcg_out_modrm(s, OPC_MOVZWL, dest, src); +} + +static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw) +{ + /* movsw[lq] */ + tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src); +} + +static inline void tcg_out_ext32u(TCGContext *s, int dest, int src) +{ + /* 32-bit mov zero extends. */ + tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src); +} + +static inline void tcg_out_ext32s(TCGContext *s, int dest, int src) +{ + tcg_out_modrm(s, OPC_MOVSLQ, dest, src); +} + +static inline void tcg_out_bswap64(TCGContext *s, int reg) +{ + tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0); +} + +static void tgen_arithi(TCGContext *s, int c, int r0, + tcg_target_long val, int cf) +{ + int rexw = 0; + + if (TCG_TARGET_REG_BITS == 64) { + rexw = c & -8; + c &= 7; + } + + /* ??? 
While INC is 2 bytes shorter than ADDL $1, they also induce + partial flags update stalls on Pentium4 and are not recommended + by current Intel optimization manuals. */ + if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) { + int is_inc = (c == ARITH_ADD) != (val < 0); /* VBox: cppcheck: "xor" for bools is "not-equals" */ + if (TCG_TARGET_REG_BITS == 64) { + /* The single-byte increment encodings are re-tasked as the + REX prefixes. Use the MODRM encoding. */ + tcg_out_modrm(s, OPC_GRP5 + rexw, + (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0); + } else { + tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0); + } + return; + } + + if (c == ARITH_AND) { + if (TCG_TARGET_REG_BITS == 64) { + if (val == 0xffffffffu) { + tcg_out_ext32u(s, r0, r0); + return; + } + if (val == (uint32_t)val) { + /* AND with no high bits set can use a 32-bit operation. */ + rexw = 0; + } + } + if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) { + tcg_out_ext8u(s, r0, r0); + return; + } + if (val == 0xffffu) { + tcg_out_ext16u(s, r0, r0); + return; + } + } + + if (val == (int8_t)val) { + tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0); + tcg_out8(s, val); + return; + } + if (rexw == 0 || val == (int32_t)val) { + tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0); + tcg_out32(s, val); + return; + } + + tcg_abort(); +} + +static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) +{ + if (val != 0) { + tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0); + } +} + +#if defined(VBOX) && defined(RT_OS_DARWIN) && ARCH_BITS == 32 +# define VBOX_16_BYTE_STACK_ALIGN +#endif +#ifdef VBOX_16_BYTE_STACK_ALIGN +static void tcg_out_subi(TCGContext *s, int reg, tcg_target_long val) +{ + if (val != 0) { + tgen_arithi(s, ARITH_SUB + P_REXW, reg, val, 0); + } +} +#endif + +/* Use SMALL != 0 to force a short forward branch. 
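+   When the label is already resolved and the displacement fits in eight
+   bits, the short JMP/Jcc form is emitted; otherwise the 32-bit form is
+   used (and a forced short branch aborts).  Unresolved labels are emitted
+   with an R_386_PC8 or R_386_PC32 relocation to be patched later.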
*/ +static void tcg_out_jxx(TCGContext *s, int opc, int label_index, int small) +{ + int32_t val, val1; + TCGLabel *l = &s->labels[label_index]; + + if (l->has_value) { + val = l->u.value - (tcg_target_long)s->code_ptr; + val1 = val - 2; + if ((int8_t)val1 == val1) { + if (opc == -1) { + tcg_out8(s, OPC_JMP_short); + } else { + tcg_out8(s, OPC_JCC_short + opc); + } + tcg_out8(s, val1); + } else { + if (small) { + tcg_abort(); + } + if (opc == -1) { + tcg_out8(s, OPC_JMP_long); + tcg_out32(s, val - 5); + } else { + tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); + tcg_out32(s, val - 6); + } + } + } else if (small) { + if (opc == -1) { + tcg_out8(s, OPC_JMP_short); + } else { + tcg_out8(s, OPC_JCC_short + opc); + } + tcg_out_reloc(s, s->code_ptr, R_386_PC8, label_index, -1); + s->code_ptr += 1; + } else { + if (opc == -1) { + tcg_out8(s, OPC_JMP_long); + } else { + tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0); + } + tcg_out_reloc(s, s->code_ptr, R_386_PC32, label_index, -4); + s->code_ptr += 4; + } +} + +static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2, + int const_arg2, int rexw) +{ + if (const_arg2) { + if (arg2 == 0) { + /* test r, r */ + tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1); + } else { + tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0); + } + } else { + tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2); + } +} + +static void tcg_out_brcond32(TCGContext *s, TCGCond cond, + TCGArg arg1, TCGArg arg2, int const_arg2, + int label_index, int small) +{ + tcg_out_cmp(s, arg1, arg2, const_arg2, 0); + tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index, small); +} + +#if TCG_TARGET_REG_BITS == 64 +static void tcg_out_brcond64(TCGContext *s, TCGCond cond, + TCGArg arg1, TCGArg arg2, int const_arg2, + int label_index, int small) +{ + tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW); + tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index, small); +} +#else +/* XXX: we implement it at the target level to avoid having to + handle cross basic blocks temporaries */ +static void tcg_out_brcond2(TCGContext *s, const TCGArg *args, + const int *const_args, int small) +{ + int label_next; + label_next = gen_new_label(); + switch(args[4]) { + case TCG_COND_EQ: + tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2], + label_next, 1); + tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3], + args[5], small); + break; + case TCG_COND_NE: + tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2], + args[5], small); + tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3], + args[5], small); + break; + case TCG_COND_LT: + tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2], + args[5], small); + break; + case TCG_COND_LE: + tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2], + args[5], small); + break; + case TCG_COND_GT: + tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2], + args[5], small); + break; + case TCG_COND_GE: + tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2], + args[5], small); + break; 
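+    /* The double-word comparison above is decomposed as: decide on the
+       high words first (a signed comparison where the condition requires
+       it), and only when they are equal fall through to an unsigned
+       comparison of the low words; e.g. LT becomes
+       (ah <s bh) || (ah == bh && al <u bl).  The unsigned conditions
+       below follow the same pattern with an unsigned comparison of the
+       high words. */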
+ case TCG_COND_LTU: + tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2], + args[5], small); + break; + case TCG_COND_LEU: + tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2], + args[5], small); + break; + case TCG_COND_GTU: + tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2], + args[5], small); + break; + case TCG_COND_GEU: + tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3], + args[5], small); + tcg_out_jxx(s, JCC_JNE, label_next, 1); + tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2], + args[5], small); + break; + default: + tcg_abort(); + } + tcg_out_label(s, label_next, (tcg_target_long)s->code_ptr); +} +#endif + +static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest, + TCGArg arg1, TCGArg arg2, int const_arg2) +{ + tcg_out_cmp(s, arg1, arg2, const_arg2, 0); + tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest); + tcg_out_ext8u(s, dest, dest); +} + +#if TCG_TARGET_REG_BITS == 64 +static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest, + TCGArg arg1, TCGArg arg2, int const_arg2) +{ + tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW); + tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest); + tcg_out_ext8u(s, dest, dest); +} +#else +static void tcg_out_setcond2(TCGContext *s, const TCGArg *args, + const int *const_args) +{ + TCGArg new_args[6]; + int label_true, label_over; + + memcpy(new_args, args+1, 5*sizeof(TCGArg)); + + if (args[0] == args[1] || args[0] == args[2] + || (!const_args[3] && args[0] == args[3]) + || (!const_args[4] && args[0] == args[4])) { + /* When the destination overlaps with one of the argument + registers, don't do anything tricky. */ + label_true = gen_new_label(); + label_over = gen_new_label(); + + new_args[5] = label_true; + tcg_out_brcond2(s, new_args, const_args+1, 1); + + tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); + tcg_out_jxx(s, JCC_JMP, label_over, 1); + tcg_out_label(s, label_true, (tcg_target_long)s->code_ptr); + + tcg_out_movi(s, TCG_TYPE_I32, args[0], 1); + tcg_out_label(s, label_over, (tcg_target_long)s->code_ptr); + } else { + /* When the destination does not overlap one of the arguments, + clear the destination first, jump if cond false, and emit an + increment in the true case. This results in smaller code. */ + + tcg_out_movi(s, TCG_TYPE_I32, args[0], 0); + + label_over = gen_new_label(); + new_args[4] = tcg_invert_cond(new_args[4]); + new_args[5] = label_over; + tcg_out_brcond2(s, new_args, const_args+1, 1); + + tgen_arithi(s, ARITH_ADD, args[0], 1, 0); + tcg_out_label(s, label_over, (tcg_target_long)s->code_ptr); + } +} +#endif + +static void tcg_out_branch(TCGContext *s, int call, tcg_target_long dest) +{ +#ifdef VBOX + tcg_target_long disp = dest - (tcg_target_long)s->code_ptr + - tcg_calc_opc_len(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0) + - 4; +#else + tcg_target_long disp = dest - (tcg_target_long)s->code_ptr - 5; +#endif + + if (disp == (int32_t)disp) { + tcg_out_opc(s, call ? 
OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0); + tcg_out32(s, disp); + } else { + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, dest); + tcg_out_modrm(s, OPC_GRP5, + call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev, TCG_REG_R10); + } +} + +static inline void tcg_out_calli(TCGContext *s, tcg_target_long dest) +{ +#ifdef VBOX + tcg_gen_stack_alignment_check(s); +#endif + tcg_out_branch(s, 1, dest); +} + +static void tcg_out_jmp(TCGContext *s, tcg_target_long dest) +{ + tcg_out_branch(s, 0, dest); +} + +#if defined(CONFIG_SOFTMMU) + +#include "../../softmmu_defs.h" + +static void *qemu_ld_helpers[4] = { + __ldb_mmu, + __ldw_mmu, + __ldl_mmu, + __ldq_mmu, +}; + +static void *qemu_st_helpers[4] = { + __stb_mmu, + __stw_mmu, + __stl_mmu, + __stq_mmu, +}; + +/* Perform the TLB load and compare. + + Inputs: + ADDRLO_IDX contains the index into ARGS of the low part of the + address; the high part of the address is at ADDR_LOW_IDX+1. + + MEM_INDEX and S_BITS are the memory context and log2 size of the load. + + WHICH is the offset into the CPUTLBEntry structure of the slot to read. + This should be offsetof addr_read or addr_write. + + Outputs: + LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses) + positions of the displacements of forward jumps to the TLB miss case. + + First argument register is loaded with the low part of the address. + In the TLB hit case, it has been adjusted as indicated by the TLB + and so is a host address. In the TLB miss case, it continues to + hold a guest address. + + Second argument register is clobbered. */ + +static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx, + int mem_index, int s_bits, + const TCGArg *args, + uint8_t **label_ptr, int which) +{ + const int addrlo = args[addrlo_idx]; + const int r0 = tcg_target_call_iarg_regs[0]; + const int r1 = tcg_target_call_iarg_regs[1]; + TCGType type = TCG_TYPE_I32; + int rexw = 0; + + if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 64) { + type = TCG_TYPE_I64; + rexw = P_REXW; + } + + tcg_out_mov(s, type, r1, addrlo); + tcg_out_mov(s, type, r0, addrlo); + + tcg_out_shifti(s, SHIFT_SHR + rexw, r1, + TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + + tgen_arithi(s, ARITH_AND + rexw, r0, + TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0); + tgen_arithi(s, ARITH_AND + rexw, r1, + (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0); + + tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, r1, TCG_AREG0, r1, 0, + offsetof(CPUState, tlb_table[mem_index][0]) + + which); + + /* cmp 0(r1), r0 */ + tcg_out_modrm_offset(s, OPC_CMP_GvEv + rexw, r0, r1, 0); + + tcg_out_mov(s, type, r0, addrlo); + + /* jne label1 */ + tcg_out8(s, OPC_JCC_short + JCC_JNE); + label_ptr[0] = s->code_ptr; + s->code_ptr++; + + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + /* cmp 4(r1), addrhi */ + tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4); + + /* jne label1 */ + tcg_out8(s, OPC_JCC_short + JCC_JNE); + label_ptr[1] = s->code_ptr; + s->code_ptr++; + } + + /* TLB Hit. 
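+       The addend field of the matching CPUTLBEntry is added to the guest
+       address in the first argument register, turning it into a host
+       address for the direct access emitted by the caller.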
*/ + + /* add addend(r1), r0 */ + tcg_out_modrm_offset(s, OPC_ADD_GvEv + P_REXW, r0, r1, + offsetof(CPUTLBEntry, addend) - which); +} +#endif + +static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi, + int base, tcg_target_long ofs, int sizeop) +{ +#ifdef TARGET_WORDS_BIGENDIAN + const int bswap = 1; +#else + const int bswap = 0; +#endif + switch (sizeop) { + case 0: + tcg_out_modrm_offset(s, OPC_MOVZBL, datalo, base, ofs); + break; + case 0 | 4: + tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW, datalo, base, ofs); + break; + case 1: + tcg_out_modrm_offset(s, OPC_MOVZWL, datalo, base, ofs); + if (bswap) { + tcg_out_rolw_8(s, datalo); + } + break; + case 1 | 4: + if (bswap) { + tcg_out_modrm_offset(s, OPC_MOVZWL, datalo, base, ofs); + tcg_out_rolw_8(s, datalo); + tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo); + } else { + tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW, datalo, base, ofs); + } + break; + case 2: + tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs); + if (bswap) { + tcg_out_bswap32(s, datalo); + } + break; +#if TCG_TARGET_REG_BITS == 64 + case 2 | 4: + if (bswap) { + tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs); + tcg_out_bswap32(s, datalo); + tcg_out_ext32s(s, datalo, datalo); + } else { + tcg_out_modrm_offset(s, OPC_MOVSLQ, datalo, base, ofs); + } + break; +#endif + case 3: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_ld(s, TCG_TYPE_I64, datalo, base, ofs); + if (bswap) { + tcg_out_bswap64(s, datalo); + } + } else { + if (bswap) { + int t = datalo; + datalo = datahi; + datahi = t; + } + if (base != datalo) { + tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs); + tcg_out_ld(s, TCG_TYPE_I32, datahi, base, ofs + 4); + } else { + tcg_out_ld(s, TCG_TYPE_I32, datahi, base, ofs + 4); + tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs); + } + if (bswap) { + tcg_out_bswap32(s, datalo); + tcg_out_bswap32(s, datahi); + } + } + break; + default: + tcg_abort(); + } +} + +#if defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) + +static void * const vbox_ld_helpers[] = { + __ldub_vbox_phys, + __lduw_vbox_phys, + __ldul_vbox_phys, + __ldq_vbox_phys, + __ldb_vbox_phys, + __ldw_vbox_phys, + __ldl_vbox_phys, + __ldq_vbox_phys, +}; + +static void * const vbox_st_helpers[] = { + __stb_vbox_phys, + __stw_vbox_phys, + __stl_vbox_phys, + __stq_vbox_phys +}; + +DECLINLINE(void) tcg_out_long_call(TCGContext *s, void* dst) +{ + intptr_t disp; +# ifdef VBOX + tcg_gen_stack_alignment_check(s); +# endif + disp = (uintptr_t)dst - (uintptr_t)s->code_ptr - 5; + tcg_out8(s, 0xe8); /* call disp32 */ + tcg_out32(s, disp); /* disp32 */ +} + +static void tcg_out_vbox_phys_read(TCGContext *s, int index, + int addr_reg, + int data_reg, int data_reg2) +{ + int useReg2 = ((index & 3) == 3); + + /** @todo should we make phys address accessors fastcalls - probably not a big deal */ + /* out parameter (address), note that phys address is always 64-bit */ + AssertMsg(sizeof(RTGCPHYS) == 8, ("Physical address must be 64-bits, update caller\n")); + +# if 0 + tcg_out8(s, 0x6a); tcg_out8(s, 0x00); /* push $0 */ + tcg_out_push(s, addr_reg); +# else + /* mov addr_reg, %eax */ + tcg_out_mov(s, TCG_REG_EAX, addr_reg); +# endif + + tcg_out_long_call(s, vbox_ld_helpers[index]); + + /* mov %eax, data_reg */ + tcg_out_mov(s, data_reg, TCG_REG_EAX); + + /* returned 64-bit value */ + if (useReg2) + tcg_out_mov(s, data_reg2, TCG_REG_EDX); +} + +static void tcg_out_vbox_phys_write(TCGContext *s, int index, + int addr_reg, + int val_reg, int val_reg2) { + int useReg2 = ((index & 3) == 3); + +# if 0 + /* out parameter (value2) 
*/ + if (useReg2) + tcg_out_push(s, val_reg2); + /* out parameter (value) */ + tcg_out_push(s, val_reg); + /* out parameter (address), note that phys address is always 64-bit */ + AssertMsg(sizeof(RTGCPHYS) == 8, ("Physical address must be 64-bits, update caller\n")); + tcg_out8(s, 0x6a); tcg_out8(s, 0x00); /* push $0 */ + tcg_out_push(s, addr_reg); +# else + Assert(val_reg != TCG_REG_EAX && (!useReg2 || (val_reg2 != TCG_REG_EAX))); + /* mov addr_reg, %eax */ + tcg_out_mov(s, TCG_REG_EAX, addr_reg); + Assert(!useReg2 || (val_reg2 != TCG_REG_EDX)); + /* mov val_reg, %edx */ + tcg_out_mov(s, TCG_REG_EDX, val_reg); + if (useReg2) + tcg_out_mov(s, TCG_REG_ECX, val_reg2); + +# endif + /* call it */ + tcg_out_long_call(s, vbox_st_helpers[index]); + + /* clean stack after us */ +# if 0 + tcg_out_addi(s, TCG_REG_ESP, 8 + (useReg2 ? 8 : 4)); +# endif +} + +#endif /* defined(VBOX) && defined(REM_PHYS_ADDR_IN_TLB) */ + +/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and + EAX. It will be useful once fixed registers globals are less + common. */ +static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, + int opc) +{ + int data_reg, data_reg2 = 0; + int addrlo_idx; +#if defined(CONFIG_SOFTMMU) + int mem_index, s_bits, arg_idx; + uint8_t *label_ptr[3]; +#endif + + data_reg = args[0]; + addrlo_idx = 1; + if (TCG_TARGET_REG_BITS == 32 && opc == 3) { + data_reg2 = args[1]; + addrlo_idx = 2; + } + +#if defined(CONFIG_SOFTMMU) + mem_index = args[addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS)]; + s_bits = opc & 3; + + tcg_out_tlb_load(s, addrlo_idx, mem_index, s_bits, args, + label_ptr, offsetof(CPUTLBEntry, addr_read)); + + /* TLB Hit. */ + tcg_out_qemu_ld_direct(s, data_reg, data_reg2, + tcg_target_call_iarg_regs[0], 0, opc); + + /* jmp label2 */ + tcg_out8(s, OPC_JMP_short); + label_ptr[2] = s->code_ptr; + s->code_ptr++; + + /* TLB Miss. */ + + /* label1: */ + *label_ptr[0] = s->code_ptr - label_ptr[0] - 1; + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + *label_ptr[1] = s->code_ptr - label_ptr[1] - 1; + } + + /* XXX: move that code at the end of the TB */ + /* The first argument is already loaded with addrlo. 
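+       For a 64-bit guest address on a 32-bit host the high word goes into
+       the next argument register, followed by mem_index, and the matching
+       __ld*_mmu helper is called.  The loaded value is returned in EAX
+       (EDX:EAX for 64-bit data) and extended or moved into data_reg below.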
*/ + arg_idx = 1; + if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) { + tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++], + args[addrlo_idx + 1]); + } + tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx], + mem_index); + tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]); + + switch(opc) { + case 0 | 4: + tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW); + break; + case 1 | 4: + tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW); + break; + case 0: + tcg_out_ext8u(s, data_reg, TCG_REG_EAX); + break; + case 1: + tcg_out_ext16u(s, data_reg, TCG_REG_EAX); + break; + case 2: + tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); + break; +#if TCG_TARGET_REG_BITS == 64 + case 2 | 4: + tcg_out_ext32s(s, data_reg, TCG_REG_EAX); + break; +#endif + case 3: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX); + } else if (data_reg == TCG_REG_EDX) { + /* xchg %edx, %eax */ + tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); + tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX); + } else { + tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); + tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX); + } + break; + default: + tcg_abort(); + } + + /* label2: */ + *label_ptr[2] = s->code_ptr - label_ptr[2] - 1; +#else +# if defined(VBOX) && defined(__MINGW64__) +# error port me +# endif + { + int32_t offset = GUEST_BASE; + int base = args[addrlo_idx]; + + if (TCG_TARGET_REG_BITS == 64) { + /* ??? We assume all operations have left us with register + contents that are zero extended. So far this appears to + be true. If we want to enforce this, we can either do + an explicit zero-extension here, or (if GUEST_BASE == 0) + use the ADDR32 prefix. For now, do nothing. */ + + if (offset != GUEST_BASE) { + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RDI, GUEST_BASE); + tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RDI, base); + base = TCG_REG_RDI, offset = 0; + } + } + + tcg_out_qemu_ld_direct(s, data_reg, data_reg2, base, offset, opc); + } +#endif +} + +static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi, + int base, tcg_target_long ofs, int sizeop) +{ +#if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) +#ifdef TARGET_WORDS_BIGENDIAN + const int bswap = 1; +#else + const int bswap = 0; +#endif + /* ??? Ideally we wouldn't need a scratch register. For user-only, + we could perform the bswap twice to restore the original value + instead of moving to the scratch. But as it is, the L constraint + means that the second argument reg is definitely free here. 
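+       The scratch register only ever holds a byte-swapped copy, so the
+       original datalo/datahi register contents are left intact.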
*/ + int scratch = tcg_target_call_iarg_regs[1]; + + switch (sizeop) { + case 0: + tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R, datalo, base, ofs); + break; + case 1: + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); + tcg_out_rolw_8(s, scratch); + datalo = scratch; + } + tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16, datalo, base, ofs); + break; + case 2: + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); + tcg_out_bswap32(s, scratch); + datalo = scratch; + } + tcg_out_st(s, TCG_TYPE_I32, datalo, base, ofs); + break; + case 3: + if (TCG_TARGET_REG_BITS == 64) { + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo); + tcg_out_bswap64(s, scratch); + datalo = scratch; + } + tcg_out_st(s, TCG_TYPE_I64, datalo, base, ofs); + } else if (bswap) { + tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi); + tcg_out_bswap32(s, scratch); + tcg_out_st(s, TCG_TYPE_I32, scratch, base, ofs); + tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); + tcg_out_bswap32(s, scratch); + tcg_out_st(s, TCG_TYPE_I32, scratch, base, ofs + 4); + } else { + tcg_out_st(s, TCG_TYPE_I32, datalo, base, ofs); + tcg_out_st(s, TCG_TYPE_I32, datahi, base, ofs + 4); + } + break; + default: + tcg_abort(); + } +#else /* VBOX */ +# error "broken" + tcg_out_vbox_phys_read(s, opc, r0, data_reg, data_reg2); +#endif +} + +static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, + int opc) +{ + int data_reg, data_reg2 = 0; + int addrlo_idx; +#if defined(CONFIG_SOFTMMU) + int mem_index, s_bits; + int stack_adjust; + uint8_t *label_ptr[3]; +#endif + + data_reg = args[0]; + addrlo_idx = 1; + if (TCG_TARGET_REG_BITS == 32 && opc == 3) { + data_reg2 = args[1]; + addrlo_idx = 2; + } + +#if defined(CONFIG_SOFTMMU) + mem_index = args[addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS)]; + s_bits = opc; + + tcg_out_tlb_load(s, addrlo_idx, mem_index, s_bits, args, + label_ptr, offsetof(CPUTLBEntry, addr_write)); + + /* TLB Hit. */ + tcg_out_qemu_st_direct(s, data_reg, data_reg2, + tcg_target_call_iarg_regs[0], 0, opc); + + /* jmp label2 */ + tcg_out8(s, OPC_JMP_short); + label_ptr[2] = s->code_ptr; + s->code_ptr++; + + /* TLB Miss. */ + + /* label1: */ + *label_ptr[0] = s->code_ptr - label_ptr[0] - 1; + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + *label_ptr[1] = s->code_ptr - label_ptr[1] - 1; + } + +# if !defined(VBOX) || !defined(REM_PHYS_ADDR_IN_TLB) + + /* XXX: move that code at the end of the TB */ + if (TCG_TARGET_REG_BITS == 64) { +# if defined(VBOX) && defined(__MINGW64__) + tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32), + tcg_target_call_iarg_regs[1], data_reg); + tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index); +# else + tcg_out_mov(s, (opc == 3 ? 
TCG_TYPE_I64 : TCG_TYPE_I32), + TCG_REG_RSI, data_reg); + tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_RDX, mem_index); +# endif + stack_adjust = 0; + } else if (TARGET_LONG_BITS == 32) { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, data_reg); + if (opc == 3) { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_ECX, data_reg2); +# ifdef VBOX_16_BYTE_STACK_ALIGN + tcg_out_subi(s, TCG_REG_ESP, 12); +# endif + tcg_out_pushi(s, mem_index); + stack_adjust = 4; + } else { + tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, mem_index); + stack_adjust = 0; + } + } else { + if (opc == 3) { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, args[addrlo_idx + 1]); +# ifdef VBOX_16_BYTE_STACK_ALIGN + tcg_out_pushi(s, 0); +# endif + tcg_out_pushi(s, mem_index); + tcg_out_push(s, data_reg2); + tcg_out_push(s, data_reg); + stack_adjust = 12; + } else { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, args[addrlo_idx + 1]); + switch(opc) { + case 0: + tcg_out_ext8u(s, TCG_REG_ECX, data_reg); + break; + case 1: + tcg_out_ext16u(s, TCG_REG_ECX, data_reg); + break; + case 2: + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_ECX, data_reg); + break; + } +# ifdef VBOX_16_BYTE_STACK_ALIGN + tcg_out_subi(s, TCG_REG_ESP, 12); +# endif + tcg_out_pushi(s, mem_index); + stack_adjust = 4; + } + } + + tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]); + +# ifdef VBOX_16_BYTE_STACK_ALIGN + if (stack_adjust != 0) { + tcg_out_addi(s, TCG_REG_ESP, RT_ALIGN(stack_adjust, 16)); + } +# else + if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) { + /* Pop and discard. This is 2 bytes smaller than the add. */ + tcg_out_pop(s, TCG_REG_ECX); + } else if (stack_adjust != 0) { + tcg_out_addi(s, TCG_REG_ESP, stack_adjust); + } +# endif + +# else /* VBOX && REM_PHYS_ADDR_IN_TLB */ +# error Borked + tcg_out_vbox_phys_write(s, opc, r0, data_reg, data_reg2); +# endif /* VBOX && REM_PHYS_ADDR_IN_TLB */ + + /* label2: */ + *label_ptr[2] = s->code_ptr - label_ptr[2] - 1; +#else +# if defined(VBOX) && defined(__MINGW64__) +# error port me +# endif + { + int32_t offset = GUEST_BASE; + int base = args[addrlo_idx]; + + if (TCG_TARGET_REG_BITS == 64) { + /* ??? We assume all operations have left us with register + contents that are zero extended. So far this appears to + be true. If we want to enforce this, we can either do + an explicit zero-extension here, or (if GUEST_BASE == 0) + use the ADDR32 prefix. For now, do nothing. 
*/ + + if (offset != GUEST_BASE) { + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RDI, GUEST_BASE); + tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RDI, base); + base = TCG_REG_RDI, offset = 0; + } + } + + tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc); + } +#endif +} + +static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, + const TCGArg *args, const int *const_args) +{ + int c, rexw = 0; + +#if TCG_TARGET_REG_BITS == 64 +# define OP_32_64(x) \ + case glue(glue(INDEX_op_, x), _i64): \ + rexw = P_REXW; /* FALLTHRU */ \ + case glue(glue(INDEX_op_, x), _i32) +#else +# define OP_32_64(x) \ + case glue(glue(INDEX_op_, x), _i32) +#endif + + switch(opc) { + case INDEX_op_exit_tb: + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, args[0]); + tcg_out_jmp(s, (tcg_target_long) tb_ret_addr); + break; + case INDEX_op_goto_tb: + if (s->tb_jmp_offset) { + /* direct jump method */ + tcg_out8(s, OPC_JMP_long); /* jmp im */ + s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf; + tcg_out32(s, 0); + } else { + /* indirect jump method */ + tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1, + (tcg_target_long)(s->tb_next + args[0])); + } + s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf; + break; + case INDEX_op_call: + if (const_args[0]) { + tcg_out_calli(s, args[0]); + } else { + /* call *reg */ + tcg_out_modrm(s, OPC_GRP5, EXT5_CALLN_Ev, args[0]); + } + break; + case INDEX_op_jmp: + if (const_args[0]) { + tcg_out_jmp(s, args[0]); + } else { + /* jmp *reg */ + tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, args[0]); + } + break; + case INDEX_op_br: + tcg_out_jxx(s, JCC_JMP, args[0], 0); + break; + case INDEX_op_movi_i32: + tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]); + break; + OP_32_64(ld8u): + /* Note that we can ignore REXW for the zero-extend to 64-bit. */ + tcg_out_modrm_offset(s, OPC_MOVZBL, args[0], args[1], args[2]); + break; + OP_32_64(ld8s): + tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, args[0], args[1], args[2]); + break; + OP_32_64(ld16u): + /* Note that we can ignore REXW for the zero-extend to 64-bit. */ + tcg_out_modrm_offset(s, OPC_MOVZWL, args[0], args[1], args[2]); + break; + OP_32_64(ld16s): + tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, args[0], args[1], args[2]); + break; +#if TCG_TARGET_REG_BITS == 64 + case INDEX_op_ld32u_i64: +#endif + case INDEX_op_ld_i32: + tcg_out_ld(s, TCG_TYPE_I32, args[0], args[1], args[2]); + break; + + OP_32_64(st8): + tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, + args[0], args[1], args[2]); + break; + OP_32_64(st16): + tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, + args[0], args[1], args[2]); + break; +#if TCG_TARGET_REG_BITS == 64 + case INDEX_op_st32_i64: +#endif + case INDEX_op_st_i32: + tcg_out_st(s, TCG_TYPE_I32, args[0], args[1], args[2]); + break; + + OP_32_64(add): + /* For 3-operand addition, use LEA. */ + if (args[0] != args[1]) { + TCGArg a0 = args[0], a1 = args[1], a2 = args[2], c3 = 0; + + if (const_args[2]) { + c3 = a2, a2 = -1; + } else if (a0 == a2) { + /* Watch out for dest = src + dest, since we've removed + the matching constraint on the add. 
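+               In that case a plain ADD is emitted instead; otherwise the
+               LEA below forms a0 = a1 + a2 (or a1 + constant) without
+               modifying the source registers or the flags.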
*/ + tgen_arithr(s, ARITH_ADD + rexw, a0, a1); + break; + } + + tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3); + break; + } + c = ARITH_ADD; + goto gen_arith; + OP_32_64(sub): + c = ARITH_SUB; + goto gen_arith; + OP_32_64(and): + c = ARITH_AND; + goto gen_arith; + OP_32_64(or): + c = ARITH_OR; + goto gen_arith; + OP_32_64(xor): + c = ARITH_XOR; + goto gen_arith; + gen_arith: + if (const_args[2]) { + tgen_arithi(s, c + rexw, args[0], args[2], 0); + } else { + tgen_arithr(s, c + rexw, args[0], args[2]); + } + break; + + OP_32_64(mul): + if (const_args[2]) { + int32_t val; + val = args[2]; + if (val == (int8_t)val) { + tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, args[0], args[0]); + tcg_out8(s, val); + } else { + tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, args[0], args[0]); + tcg_out32(s, val); + } + } else { + tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, args[0], args[2]); + } + break; + + OP_32_64(div2): + tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]); + break; + OP_32_64(divu2): + tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]); + break; + + OP_32_64(shl): + c = SHIFT_SHL; + goto gen_shift; + OP_32_64(shr): + c = SHIFT_SHR; + goto gen_shift; + OP_32_64(sar): + c = SHIFT_SAR; + goto gen_shift; + OP_32_64(rotl): + c = SHIFT_ROL; + goto gen_shift; + OP_32_64(rotr): + c = SHIFT_ROR; + goto gen_shift; + gen_shift: + if (const_args[2]) { + tcg_out_shifti(s, c + rexw, args[0], args[2]); + } else { + tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, args[0]); + } + break; + + case INDEX_op_brcond_i32: + tcg_out_brcond32(s, args[2], args[0], args[1], const_args[1], + args[3], 0); + break; + case INDEX_op_setcond_i32: + tcg_out_setcond32(s, args[3], args[0], args[1], + args[2], const_args[2]); + break; + + OP_32_64(bswap16): + tcg_out_rolw_8(s, args[0]); + break; + OP_32_64(bswap32): + tcg_out_bswap32(s, args[0]); + break; + + OP_32_64(neg): + tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, args[0]); + break; + OP_32_64(not): + tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, args[0]); + break; + + OP_32_64(ext8s): + tcg_out_ext8s(s, args[0], args[1], rexw); + break; + OP_32_64(ext16s): + tcg_out_ext16s(s, args[0], args[1], rexw); + break; + OP_32_64(ext8u): + tcg_out_ext8u(s, args[0], args[1]); + break; + OP_32_64(ext16u): + tcg_out_ext16u(s, args[0], args[1]); + break; + + case INDEX_op_qemu_ld8u: + tcg_out_qemu_ld(s, args, 0); + break; + case INDEX_op_qemu_ld8s: + tcg_out_qemu_ld(s, args, 0 | 4); + break; + case INDEX_op_qemu_ld16u: + tcg_out_qemu_ld(s, args, 1); + break; + case INDEX_op_qemu_ld16s: + tcg_out_qemu_ld(s, args, 1 | 4); + break; +#if TCG_TARGET_REG_BITS == 64 + case INDEX_op_qemu_ld32u: +#endif + case INDEX_op_qemu_ld32: + tcg_out_qemu_ld(s, args, 2); + break; + case INDEX_op_qemu_ld64: + tcg_out_qemu_ld(s, args, 3); + break; + + case INDEX_op_qemu_st8: + tcg_out_qemu_st(s, args, 0); + break; + case INDEX_op_qemu_st16: + tcg_out_qemu_st(s, args, 1); + break; + case INDEX_op_qemu_st32: + tcg_out_qemu_st(s, args, 2); + break; + case INDEX_op_qemu_st64: + tcg_out_qemu_st(s, args, 3); + break; + +#if TCG_TARGET_REG_BITS == 32 + case INDEX_op_brcond2_i32: + tcg_out_brcond2(s, args, const_args, 0); + break; + case INDEX_op_setcond2_i32: + tcg_out_setcond2(s, args, const_args); + break; + case INDEX_op_mulu2_i32: + tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_MUL, args[3]); + break; + case INDEX_op_add2_i32: + if (const_args[4]) { + tgen_arithi(s, ARITH_ADD, args[0], args[4], 1); + } else { + tgen_arithr(s, ARITH_ADD, args[0], args[4]); + } + if (const_args[5]) { + tgen_arithi(s, 
ARITH_ADC, args[1], args[5], 1); + } else { + tgen_arithr(s, ARITH_ADC, args[1], args[5]); + } + break; + case INDEX_op_sub2_i32: + if (const_args[4]) { + tgen_arithi(s, ARITH_SUB, args[0], args[4], 1); + } else { + tgen_arithr(s, ARITH_SUB, args[0], args[4]); + } + if (const_args[5]) { + tgen_arithi(s, ARITH_SBB, args[1], args[5], 1); + } else { + tgen_arithr(s, ARITH_SBB, args[1], args[5]); + } + break; +#else /* TCG_TARGET_REG_BITS == 64 */ + case INDEX_op_movi_i64: + tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]); + break; + case INDEX_op_ld32s_i64: + tcg_out_modrm_offset(s, OPC_MOVSLQ, args[0], args[1], args[2]); + break; + case INDEX_op_ld_i64: + tcg_out_ld(s, TCG_TYPE_I64, args[0], args[1], args[2]); + break; + case INDEX_op_st_i64: + tcg_out_st(s, TCG_TYPE_I64, args[0], args[1], args[2]); + break; + case INDEX_op_qemu_ld32s: + tcg_out_qemu_ld(s, args, 2 | 4); + break; + + case INDEX_op_brcond_i64: + tcg_out_brcond64(s, args[2], args[0], args[1], const_args[1], + args[3], 0); + break; + case INDEX_op_setcond_i64: + tcg_out_setcond64(s, args[3], args[0], args[1], + args[2], const_args[2]); + break; + + case INDEX_op_bswap64_i64: + tcg_out_bswap64(s, args[0]); + break; + case INDEX_op_ext32u_i64: + tcg_out_ext32u(s, args[0], args[1]); + break; + case INDEX_op_ext32s_i64: + tcg_out_ext32s(s, args[0], args[1]); + break; +#endif + + default: + tcg_abort(); + } + +#undef OP_32_64 +} + +static const TCGTargetOpDef x86_op_defs[] = { + { INDEX_op_exit_tb, { } }, + { INDEX_op_goto_tb, { } }, + { INDEX_op_call, { "ri" } }, + { INDEX_op_jmp, { "ri" } }, + { INDEX_op_br, { } }, + { INDEX_op_mov_i32, { "r", "r" } }, + { INDEX_op_movi_i32, { "r" } }, + { INDEX_op_ld8u_i32, { "r", "r" } }, + { INDEX_op_ld8s_i32, { "r", "r" } }, + { INDEX_op_ld16u_i32, { "r", "r" } }, + { INDEX_op_ld16s_i32, { "r", "r" } }, + { INDEX_op_ld_i32, { "r", "r" } }, + { INDEX_op_st8_i32, { "q", "r" } }, + { INDEX_op_st16_i32, { "r", "r" } }, + { INDEX_op_st_i32, { "r", "r" } }, + + { INDEX_op_add_i32, { "r", "r", "ri" } }, + { INDEX_op_sub_i32, { "r", "0", "ri" } }, + { INDEX_op_mul_i32, { "r", "0", "ri" } }, + { INDEX_op_div2_i32, { "a", "d", "0", "1", "r" } }, + { INDEX_op_divu2_i32, { "a", "d", "0", "1", "r" } }, + { INDEX_op_and_i32, { "r", "0", "ri" } }, + { INDEX_op_or_i32, { "r", "0", "ri" } }, + { INDEX_op_xor_i32, { "r", "0", "ri" } }, + + { INDEX_op_shl_i32, { "r", "0", "ci" } }, + { INDEX_op_shr_i32, { "r", "0", "ci" } }, + { INDEX_op_sar_i32, { "r", "0", "ci" } }, + { INDEX_op_rotl_i32, { "r", "0", "ci" } }, + { INDEX_op_rotr_i32, { "r", "0", "ci" } }, + + { INDEX_op_brcond_i32, { "r", "ri" } }, + + { INDEX_op_bswap16_i32, { "r", "0" } }, + { INDEX_op_bswap32_i32, { "r", "0" } }, + + { INDEX_op_neg_i32, { "r", "0" } }, + + { INDEX_op_not_i32, { "r", "0" } }, + + { INDEX_op_ext8s_i32, { "r", "q" } }, + { INDEX_op_ext16s_i32, { "r", "r" } }, + { INDEX_op_ext8u_i32, { "r", "q" } }, + { INDEX_op_ext16u_i32, { "r", "r" } }, + + { INDEX_op_setcond_i32, { "q", "r", "ri" } }, + +#if TCG_TARGET_REG_BITS == 32 + { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } }, + { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } }, + { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } }, + { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } }, + { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } }, +#else + { INDEX_op_mov_i64, { "r", "r" } }, + { INDEX_op_movi_i64, { "r" } }, + { INDEX_op_ld8u_i64, { "r", "r" } }, + { INDEX_op_ld8s_i64, { "r", "r" } }, + { INDEX_op_ld16u_i64, { "r", "r" } }, + { INDEX_op_ld16s_i64, { "r", "r" 
} }, + { INDEX_op_ld32u_i64, { "r", "r" } }, + { INDEX_op_ld32s_i64, { "r", "r" } }, + { INDEX_op_ld_i64, { "r", "r" } }, + { INDEX_op_st8_i64, { "r", "r" } }, + { INDEX_op_st16_i64, { "r", "r" } }, + { INDEX_op_st32_i64, { "r", "r" } }, + { INDEX_op_st_i64, { "r", "r" } }, + + { INDEX_op_add_i64, { "r", "0", "re" } }, + { INDEX_op_mul_i64, { "r", "0", "re" } }, + { INDEX_op_div2_i64, { "a", "d", "0", "1", "r" } }, + { INDEX_op_divu2_i64, { "a", "d", "0", "1", "r" } }, + { INDEX_op_sub_i64, { "r", "0", "re" } }, + { INDEX_op_and_i64, { "r", "0", "reZ" } }, + { INDEX_op_or_i64, { "r", "0", "re" } }, + { INDEX_op_xor_i64, { "r", "0", "re" } }, + + { INDEX_op_shl_i64, { "r", "0", "ci" } }, + { INDEX_op_shr_i64, { "r", "0", "ci" } }, + { INDEX_op_sar_i64, { "r", "0", "ci" } }, + { INDEX_op_rotl_i64, { "r", "0", "ci" } }, + { INDEX_op_rotr_i64, { "r", "0", "ci" } }, + + { INDEX_op_brcond_i64, { "r", "re" } }, + { INDEX_op_setcond_i64, { "r", "r", "re" } }, + + { INDEX_op_bswap16_i64, { "r", "0" } }, + { INDEX_op_bswap32_i64, { "r", "0" } }, + { INDEX_op_bswap64_i64, { "r", "0" } }, + { INDEX_op_neg_i64, { "r", "0" } }, + { INDEX_op_not_i64, { "r", "0" } }, + + { INDEX_op_ext8s_i64, { "r", "r" } }, + { INDEX_op_ext16s_i64, { "r", "r" } }, + { INDEX_op_ext32s_i64, { "r", "r" } }, + { INDEX_op_ext8u_i64, { "r", "r" } }, + { INDEX_op_ext16u_i64, { "r", "r" } }, + { INDEX_op_ext32u_i64, { "r", "r" } }, +#endif + +#if TCG_TARGET_REG_BITS == 64 + { INDEX_op_qemu_ld8u, { "r", "L" } }, + { INDEX_op_qemu_ld8s, { "r", "L" } }, + { INDEX_op_qemu_ld16u, { "r", "L" } }, + { INDEX_op_qemu_ld16s, { "r", "L" } }, + { INDEX_op_qemu_ld32, { "r", "L" } }, + { INDEX_op_qemu_ld32u, { "r", "L" } }, + { INDEX_op_qemu_ld32s, { "r", "L" } }, + { INDEX_op_qemu_ld64, { "r", "L" } }, + + { INDEX_op_qemu_st8, { "L", "L" } }, + { INDEX_op_qemu_st16, { "L", "L" } }, + { INDEX_op_qemu_st32, { "L", "L" } }, + { INDEX_op_qemu_st64, { "L", "L" } }, +#elif TARGET_LONG_BITS <= TCG_TARGET_REG_BITS + { INDEX_op_qemu_ld8u, { "r", "L" } }, + { INDEX_op_qemu_ld8s, { "r", "L" } }, + { INDEX_op_qemu_ld16u, { "r", "L" } }, + { INDEX_op_qemu_ld16s, { "r", "L" } }, + { INDEX_op_qemu_ld32, { "r", "L" } }, + { INDEX_op_qemu_ld64, { "r", "r", "L" } }, + + { INDEX_op_qemu_st8, { "cb", "L" } }, + { INDEX_op_qemu_st16, { "L", "L" } }, + { INDEX_op_qemu_st32, { "L", "L" } }, + { INDEX_op_qemu_st64, { "L", "L", "L" } }, +#else + { INDEX_op_qemu_ld8u, { "r", "L", "L" } }, + { INDEX_op_qemu_ld8s, { "r", "L", "L" } }, + { INDEX_op_qemu_ld16u, { "r", "L", "L" } }, + { INDEX_op_qemu_ld16s, { "r", "L", "L" } }, + { INDEX_op_qemu_ld32, { "r", "L", "L" } }, + { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } }, + + { INDEX_op_qemu_st8, { "cb", "L", "L" } }, + { INDEX_op_qemu_st16, { "L", "L", "L" } }, + { INDEX_op_qemu_st32, { "L", "L", "L" } }, + { INDEX_op_qemu_st64, { "L", "L", "L", "L" } }, +#endif + { -1 }, +}; + +static int tcg_target_callee_save_regs[] = { +#if TCG_TARGET_REG_BITS == 64 + TCG_REG_RBP, + TCG_REG_RBX, +# if defined(VBOX) && defined(__MINGW64__) + TCG_REG_RSI, + TCG_REG_RDI, +# endif + TCG_REG_R12, + TCG_REG_R13, + /* TCG_REG_R14, */ /* Currently used for the global env. */ + TCG_REG_R15, +#else +# ifndef VBOX + /* TCG_REG_EBP, */ /* Currently used for the global env. */ + TCG_REG_EBX, + TCG_REG_ESI, + TCG_REG_EDI, +# else + TCG_REG_EBP, + TCG_REG_EBX, + /* TCG_REG_ESI, */ /* Currently used for the global env. 
*/ + TCG_REG_EDI, +# endif +#endif +}; + +/* Generate global QEMU prologue and epilogue code */ +static void tcg_target_qemu_prologue(TCGContext *s) +{ + int i, frame_size, push_size, stack_addend; + + /* TB prologue */ + + /* Save all callee saved registers. */ + for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) { + tcg_out_push(s, tcg_target_callee_save_regs[i]); + } +# if defined(VBOX_STRICT) && defined(RT_ARCH_X86) + tcg_out8(s, 0x31); /* xor ebp, ebp */ + tcg_out8(s, 0xed); +# endif + + /* Reserve some stack space. */ + push_size = 1 + ARRAY_SIZE(tcg_target_callee_save_regs); + push_size *= TCG_TARGET_REG_BITS / 8; + + frame_size = push_size + TCG_STATIC_CALL_ARGS_SIZE; +#if defined(VBOX) && defined(__MINGW64__) + frame_size += TCG_TARGET_CALL_STACK_OFFSET; +#endif + frame_size = (frame_size + TCG_TARGET_STACK_ALIGN - 1) & + ~(TCG_TARGET_STACK_ALIGN - 1); + stack_addend = frame_size - push_size; + tcg_out_addi(s, TCG_REG_ESP, -stack_addend); + + /* jmp *tb. */ + tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[0]); +# ifdef VBOX + tcg_gen_stack_alignment_check(s); +# endif + + tcg_out_modrm(s, 0xff, 4, TCG_REG_EAX); /* jmp *%eax */ + + /* TB epilogue */ + tb_ret_addr = s->code_ptr; + + tcg_out_addi(s, TCG_REG_ESP, stack_addend); + + for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { + tcg_out_pop(s, tcg_target_callee_save_regs[i]); + } + tcg_out_opc(s, OPC_RET, 0, 0, 0); +} + +static void tcg_target_init(TCGContext *s) +{ +#if !defined(CONFIG_USER_ONLY) + /* fail safe */ + if ((1 << CPU_TLB_ENTRY_BITS) != sizeof(CPUTLBEntry)) + tcg_abort(); +#endif + + if (TCG_TARGET_REG_BITS == 64) { + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff); + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff); + } else { + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff); + } + + tcg_regset_clear(tcg_target_call_clobber_regs); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX); + if (TCG_TARGET_REG_BITS == 64) { +# if !defined(VBOX) || !defined(__MINGW64__) + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI); +# endif + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11); + } + + tcg_regset_clear(s->reserved_regs); + tcg_regset_set_reg(s->reserved_regs, TCG_REG_ESP); + + tcg_add_target_add_op_defs(x86_op_defs); +} diff --git a/src/recompiler/tcg/i386/tcg-target.h b/src/recompiler/tcg/i386/tcg-target.h new file mode 100644 index 00000000..e812bc58 --- /dev/null +++ b/src/recompiler/tcg/i386/tcg-target.h @@ -0,0 +1,134 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and 
this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#define TCG_TARGET_I386 1 + +#if defined(__x86_64__) +# define TCG_TARGET_REG_BITS 64 +#else +# define TCG_TARGET_REG_BITS 32 +#endif +//#define TCG_TARGET_WORDS_BIGENDIAN + +#if TCG_TARGET_REG_BITS == 64 +# define TCG_TARGET_NB_REGS 16 +#else +# define TCG_TARGET_NB_REGS 8 +#endif + +enum { + TCG_REG_EAX = 0, + TCG_REG_ECX, + TCG_REG_EDX, + TCG_REG_EBX, + TCG_REG_ESP, + TCG_REG_EBP, + TCG_REG_ESI, + TCG_REG_EDI, + + /* 64-bit registers; always define the symbols to avoid + too much if-deffing. */ + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, + TCG_REG_R14, + TCG_REG_R15, + TCG_REG_RAX = TCG_REG_EAX, + TCG_REG_RCX = TCG_REG_ECX, + TCG_REG_RDX = TCG_REG_EDX, + TCG_REG_RBX = TCG_REG_EBX, + TCG_REG_RSP = TCG_REG_ESP, + TCG_REG_RBP = TCG_REG_EBP, + TCG_REG_RSI = TCG_REG_ESI, + TCG_REG_RDI = TCG_REG_EDI, +}; + +#define TCG_CT_CONST_S32 0x100 +#define TCG_CT_CONST_U32 0x200 + +/* used for function call generation */ +#define TCG_REG_CALL_STACK TCG_REG_ESP +#define TCG_TARGET_STACK_ALIGN 16 +#if defined(VBOX) && defined(__MINGW64__) +# define TCG_TARGET_CALL_STACK_OFFSET 32 /* 4 qword argument/register spill zone */ +#else +#define TCG_TARGET_CALL_STACK_OFFSET 0 +#endif + +/* optional instructions */ +#define TCG_TARGET_HAS_div2_i32 +#define TCG_TARGET_HAS_rot_i32 +#define TCG_TARGET_HAS_ext8s_i32 +#define TCG_TARGET_HAS_ext16s_i32 +#define TCG_TARGET_HAS_ext8u_i32 +#define TCG_TARGET_HAS_ext16u_i32 +#define TCG_TARGET_HAS_bswap16_i32 +#define TCG_TARGET_HAS_bswap32_i32 +#define TCG_TARGET_HAS_neg_i32 +#define TCG_TARGET_HAS_not_i32 +// #define TCG_TARGET_HAS_andc_i32 +// #define TCG_TARGET_HAS_orc_i32 +// #define TCG_TARGET_HAS_eqv_i32 +// #define TCG_TARGET_HAS_nand_i32 +// #define TCG_TARGET_HAS_nor_i32 + +#if TCG_TARGET_REG_BITS == 64 +#define TCG_TARGET_HAS_div2_i64 +#define TCG_TARGET_HAS_rot_i64 +#define TCG_TARGET_HAS_ext8s_i64 +#define TCG_TARGET_HAS_ext16s_i64 +#define TCG_TARGET_HAS_ext32s_i64 +#define TCG_TARGET_HAS_ext8u_i64 +#define TCG_TARGET_HAS_ext16u_i64 +#define TCG_TARGET_HAS_ext32u_i64 +#define TCG_TARGET_HAS_bswap16_i64 +#define TCG_TARGET_HAS_bswap32_i64 +#define TCG_TARGET_HAS_bswap64_i64 +#define TCG_TARGET_HAS_neg_i64 +#define TCG_TARGET_HAS_not_i64 +// #define TCG_TARGET_HAS_andc_i64 +// #define TCG_TARGET_HAS_orc_i64 +// #define TCG_TARGET_HAS_eqv_i64 +// #define TCG_TARGET_HAS_nand_i64 +// #define TCG_TARGET_HAS_nor_i64 +#endif + +#define TCG_TARGET_HAS_GUEST_BASE + +/* Note: must be synced with dyngen-exec.h */ +#if TCG_TARGET_REG_BITS == 64 +# define TCG_AREG0 TCG_REG_R14 +#else +# ifndef VBOX /* we're using ESI instead of EBP, probably due to frame pointer opt issues */ +# define TCG_AREG0 TCG_REG_EBP +# else /* VBOX */ +# define TCG_AREG0 TCG_REG_ESI +# endif /* VBOX */ +#endif + +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ +} diff --git a/src/recompiler/tcg/tcg-dyngen.c b/src/recompiler/tcg/tcg-dyngen.c 
new file mode 100644 index 00000000..b068a0a7 --- /dev/null +++ b/src/recompiler/tcg/tcg-dyngen.c @@ -0,0 +1,437 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef VBOX +#include <assert.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#else +# include <stdio.h> +# include "osdep.h" +#endif + +#include "config.h" +#include "osdep.h" + +#include "tcg.h" + +int __op_param1, __op_param2, __op_param3; +#if defined(__sparc__) || defined(__arm__) + void __op_gen_label1(){} + void __op_gen_label2(){} + void __op_gen_label3(){} +#else + int __op_gen_label1, __op_gen_label2, __op_gen_label3; +#endif +int __op_jmp0, __op_jmp1, __op_jmp2, __op_jmp3; + +#if 0 +#if defined(__s390__) +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ +} +#elif defined(__ia64__) +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + while (start < stop) { + asm volatile ("fc %0" :: "r"(start)); + start += 32; + } + asm volatile (";;sync.i;;srlz.i;;"); +} +#elif defined(__powerpc__) + +#define MIN_CACHE_LINE_SIZE 8 /* conservative value */ + +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + uintptr_t p; + + start &= ~(MIN_CACHE_LINE_SIZE - 1); + stop = (stop + MIN_CACHE_LINE_SIZE - 1) & ~(MIN_CACHE_LINE_SIZE - 1); + + for (p = start; p < stop; p += MIN_CACHE_LINE_SIZE) { + asm volatile ("dcbst 0,%0" : : "r"(p) : "memory"); + } + asm volatile ("sync" : : : "memory"); + for (p = start; p < stop; p += MIN_CACHE_LINE_SIZE) { + asm volatile ("icbi 0,%0" : : "r"(p) : "memory"); + } + asm volatile ("sync" : : : "memory"); + asm volatile ("isync" : : : "memory"); +} +#elif defined(__alpha__) +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + asm ("imb"); +} +#elif defined(__sparc__) +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + uintptr_t p; + + p = start & ~(8UL - 1UL); + stop = (stop + (8UL - 1UL)) & ~(8UL - 1UL); + + for (; p < stop; p += 8) + __asm__ __volatile__("flush\t%0" : : "r" (p)); +} +#elif defined(__arm__) +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + register uintptr_t _beg __asm ("a1") = start; + register uintptr_t _end __asm ("a2") = stop; + register uintptr_t _flg __asm ("a3") = 0; + __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg)); +} +#elif 
defined(__mc68000) + +# include <asm/cachectl.h> +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + cacheflush(start,FLUSH_SCOPE_LINE,FLUSH_CACHE_BOTH,stop-start+16); +} +#elif defined(__mips__) + +#include <sys/cachectl.h> +static inline void flush_icache_range(uintptr_t start, uintptr_t stop) +{ + _flush_cache ((void *)start, stop - start, BCACHE); +} +#else +#error unsupported CPU +#endif + +#ifdef __alpha__ + +register int gp asm("$29"); + +static inline void immediate_ldah(void *p, int val) { + uint32_t *dest = p; + long high = ((val >> 16) + ((val >> 15) & 1)) & 0xffff; + + *dest &= ~0xffff; + *dest |= high; + *dest |= 31 << 16; +} +static inline void immediate_lda(void *dest, int val) { + *(uint16_t *) dest = val; +} +void fix_bsr(void *p, int offset) { + uint32_t *dest = p; + *dest &= ~((1 << 21) - 1); + *dest |= (offset >> 2) & ((1 << 21) - 1); +} + +#endif /* __alpha__ */ + +#ifdef __ia64 + +/* Patch instruction with "val" where "mask" has 1 bits. */ +static inline void ia64_patch (uint64_t insn_addr, uint64_t mask, uint64_t val) +{ + uint64_t m0, m1, v0, v1, b0, b1, *b = (uint64_t *) (insn_addr & -16); +# define insn_mask ((1UL << 41) - 1) + uintptr_t shift; + + b0 = b[0]; b1 = b[1]; + shift = 5 + 41 * (insn_addr % 16); /* 5 template, 3 x 41-bit insns */ + if (shift >= 64) { + m1 = mask << (shift - 64); + v1 = val << (shift - 64); + } else { + m0 = mask << shift; m1 = mask >> (64 - shift); + v0 = val << shift; v1 = val >> (64 - shift); + b[0] = (b0 & ~m0) | (v0 & m0); + } + b[1] = (b1 & ~m1) | (v1 & m1); +} + +static inline void ia64_patch_imm60 (uint64_t insn_addr, uint64_t val) +{ + ia64_patch(insn_addr, + 0x011ffffe000UL, + ( ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */ + | ((val & 0x00000000000fffffUL) << 13) /* bit 0 -> 13 */)); + ia64_patch(insn_addr - 1, 0x1fffffffffcUL, val >> 18); +} + +static inline void ia64_imm64 (void *insn, uint64_t val) +{ + /* Ignore the slot number of the relocation; GCC and Intel + toolchains differed for some time on whether IMM64 relocs are + against slot 1 (Intel) or slot 2 (GCC). */ + uint64_t insn_addr = (uint64_t) insn & ~3UL; + + ia64_patch(insn_addr + 2, + 0x01fffefe000UL, + ( ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */ + | ((val & 0x0000000000200000UL) << 0) /* bit 21 -> 21 */ + | ((val & 0x00000000001f0000UL) << 6) /* bit 16 -> 22 */ + | ((val & 0x000000000000ff80UL) << 20) /* bit 7 -> 27 */ + | ((val & 0x000000000000007fUL) << 13) /* bit 0 -> 13 */) + ); + ia64_patch(insn_addr + 1, 0x1ffffffffffUL, val >> 22); +} + +static inline void ia64_imm60b (void *insn, uint64_t val) +{ + /* Ignore the slot number of the relocation; GCC and Intel + toolchains differed for some time on whether IMM64 relocs are + against slot 1 (Intel) or slot 2 (GCC). */ + uint64_t insn_addr = (uint64_t) insn & ~3UL; + + if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) + fprintf(stderr, "%s: value %ld out of IMM60 range\n", + __FUNCTION__, (int64_t) val); + ia64_patch_imm60(insn_addr + 2, val); +} + +static inline void ia64_imm22 (void *insn, uint64_t val) +{ + if (val + (1 << 21) >= (1 << 22)) + fprintf(stderr, "%s: value %li out of IMM22 range\n", + __FUNCTION__, (int64_t)val); + ia64_patch((uint64_t) insn, 0x01fffcfe000UL, + ( ((val & 0x200000UL) << 15) /* bit 21 -> 36 */ + | ((val & 0x1f0000UL) << 6) /* bit 16 -> 22 */ + | ((val & 0x00ff80UL) << 20) /* bit 7 -> 27 */ + | ((val & 0x00007fUL) << 13) /* bit 0 -> 13 */)); +} + +/* Like ia64_imm22(), but also clear bits 20-21. 
For addl, this has + the effect of turning "addl rX=imm22,rY" into "addl + rX=imm22,r0". */ +static inline void ia64_imm22_r0 (void *insn, uint64_t val) +{ + if (val + (1 << 21) >= (1 << 22)) + fprintf(stderr, "%s: value %li out of IMM22 range\n", + __FUNCTION__, (int64_t)val); + ia64_patch((uint64_t) insn, 0x01fffcfe000UL | (0x3UL << 20), + ( ((val & 0x200000UL) << 15) /* bit 21 -> 36 */ + | ((val & 0x1f0000UL) << 6) /* bit 16 -> 22 */ + | ((val & 0x00ff80UL) << 20) /* bit 7 -> 27 */ + | ((val & 0x00007fUL) << 13) /* bit 0 -> 13 */)); +} + +static inline void ia64_imm21b (void *insn, uint64_t val) +{ + if (val + (1 << 20) >= (1 << 21)) + fprintf(stderr, "%s: value %li out of IMM21b range\n", + __FUNCTION__, (int64_t)val); + ia64_patch((uint64_t) insn, 0x11ffffe000UL, + ( ((val & 0x100000UL) << 16) /* bit 20 -> 36 */ + | ((val & 0x0fffffUL) << 13) /* bit 0 -> 13 */)); +} + +static inline void ia64_nop_b (void *insn) +{ + ia64_patch((uint64_t) insn, (1UL << 41) - 1, 2UL << 37); +} + +static inline void ia64_ldxmov(void *insn, uint64_t val) +{ + if (val + (1 << 21) < (1 << 22)) + ia64_patch((uint64_t) insn, 0x1fff80fe000UL, 8UL << 37); +} + +static inline int ia64_patch_ltoff(void *insn, uint64_t val, + int relaxable) +{ + if (relaxable && (val + (1 << 21) < (1 << 22))) { + ia64_imm22_r0(insn, val); + return 0; + } + return 1; +} + +struct ia64_fixup { + struct ia64_fixup *next; + void *addr; /* address that needs to be patched */ + long value; +}; + +#define IA64_PLT(insn, plt_index) \ +do { \ + struct ia64_fixup *fixup = alloca(sizeof(*fixup)); \ + fixup->next = plt_fixes; \ + plt_fixes = fixup; \ + fixup->addr = (insn); \ + fixup->value = (plt_index); \ + plt_offset[(plt_index)] = 1; \ +} while (0) + +#define IA64_LTOFF(insn, val, relaxable) \ +do { \ + if (ia64_patch_ltoff(insn, val, relaxable)) { \ + struct ia64_fixup *fixup = alloca(sizeof(*fixup)); \ + fixup->next = ltoff_fixes; \ + ltoff_fixes = fixup; \ + fixup->addr = (insn); \ + fixup->value = (val); \ + } \ +} while (0) + +static inline void ia64_apply_fixes (uint8_t **gen_code_pp, + struct ia64_fixup *ltoff_fixes, + uint64_t gp, + struct ia64_fixup *plt_fixes, + int num_plts, + uintptr_t *plt_target, + unsigned int *plt_offset) +{ + static const uint8_t plt_bundle[] = { + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, /* nop 0; movl r1=GP */ + 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x60, + + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, /* nop 0; brl IP */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0 + }; + uint8_t *gen_code_ptr = *gen_code_pp, *plt_start, *got_start; + uint64_t *vp; + struct ia64_fixup *fixup; + unsigned int offset = 0; + struct fdesc { + long ip; + long gp; + } *fdesc; + int i; + + if (plt_fixes) { + plt_start = gen_code_ptr; + + for (i = 0; i < num_plts; ++i) { + if (plt_offset[i]) { + plt_offset[i] = offset; + offset += sizeof(plt_bundle); + + fdesc = (struct fdesc *) plt_target[i]; + memcpy(gen_code_ptr, plt_bundle, sizeof(plt_bundle)); + ia64_imm64 (gen_code_ptr + 0x02, fdesc->gp); + ia64_imm60b(gen_code_ptr + 0x12, + (fdesc->ip - (long) (gen_code_ptr + 0x10)) >> 4); + gen_code_ptr += sizeof(plt_bundle); + } + } + + for (fixup = plt_fixes; fixup; fixup = fixup->next) + ia64_imm21b(fixup->addr, + ((long) plt_start + plt_offset[fixup->value] + - ((long) fixup->addr & ~0xf)) >> 4); + } + + got_start = gen_code_ptr; + + /* First, create the GOT: */ + for (fixup = ltoff_fixes; fixup; fixup = fixup->next) { + /* first check if we already have this value in the GOT: */ + for (vp = (uint64_t *) 
got_start; vp < (uint64_t *) gen_code_ptr; ++vp) + if (*vp == fixup->value) + break; + if (vp == (uint64_t *) gen_code_ptr) { + /* Nope, we need to put the value in the GOT: */ + *vp = fixup->value; + gen_code_ptr += 8; + } + ia64_imm22(fixup->addr, (long) vp - gp); + } + /* Keep code ptr aligned. */ + if ((long) gen_code_ptr & 15) + gen_code_ptr += 8; + *gen_code_pp = gen_code_ptr; +} +#endif +#endif + +#ifdef CONFIG_DYNGEN_OP + +#if defined __hppa__ +struct hppa_branch_stub { + uint32_t *location; + long target; + struct hppa_branch_stub *next; +}; + +#define HPPA_RECORD_BRANCH(LIST, LOC, TARGET) \ +do { \ + struct hppa_branch_stub *stub = alloca(sizeof(struct hppa_branch_stub)); \ + stub->location = LOC; \ + stub->target = TARGET; \ + stub->next = LIST; \ + LIST = stub; \ +} while (0) + +static inline void hppa_process_stubs(struct hppa_branch_stub *stub, + uint8_t **gen_code_pp) +{ + uint32_t *s = (uint32_t *)*gen_code_pp; + uint32_t *p = s + 1; + + if (!stub) return; + + for (; stub != NULL; stub = stub->next) { + uintptr_t l = (uintptr_t)p; + /* stub: + * ldil L'target, %r1 + * be,n R'target(%sr4,%r1) + */ + *p++ = 0x20200000 | reassemble_21(lrsel(stub->target, 0)); + *p++ = 0xe0202002 | (reassemble_17(rrsel(stub->target, 0) >> 2)); + hppa_patch17f(stub->location, l, 0); + } + /* b,l,n stub,%r0 */ + *s = 0xe8000002 | reassemble_17((p - s) - 2); + *gen_code_pp = (uint8_t *)p; +} +#endif /* __hppa__ */ + +const TCGArg *dyngen_op(TCGContext *s, int opc, const TCGArg *opparam_ptr) +{ + uint8_t *gen_code_ptr; + +#ifdef __hppa__ + struct hppa_branch_stub *hppa_stubs = NULL; +#endif + + gen_code_ptr = s->code_ptr; + switch(opc) { + +/* op.h is dynamically generated by dyngen.c from op.c */ +#include "op.h" + + default: + tcg_abort(); + } + +#ifdef __hppa__ + hppa_process_stubs(hppa_stubs, &gen_code_ptr); +#endif + + s->code_ptr = gen_code_ptr; + return opparam_ptr; +} +#endif diff --git a/src/recompiler/tcg/tcg-op.h b/src/recompiler/tcg/tcg-op.h new file mode 100644 index 00000000..b3890804 --- /dev/null +++ b/src/recompiler/tcg/tcg-op.h @@ -0,0 +1,2469 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "tcg.h" + +int gen_new_label(void); + +static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 arg1) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); +} + +static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 arg1) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); +} + +static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg arg1) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = arg1; +} + +static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); +} + +static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); +} + +static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGArg arg2) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = arg2; +} + +static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGArg arg2) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = arg2; +} + +static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg arg1, TCGArg arg2) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = arg1; + *gen_opparam_ptr++ = arg2; +} + +static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGv_i32 arg3) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); +} + +static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGv_i64 arg3) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); +} + +static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 arg1, + TCGv_i32 arg2, TCGArg arg3) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = arg3; +} + +static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 arg1, + TCGv_i64 arg2, TCGArg arg3) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = arg3; +} + +static inline void tcg_gen_ldst_op_i32(TCGOpcode opc, TCGv_i32 val, + TCGv_ptr base, TCGArg offset) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(val); + *gen_opparam_ptr++ = GET_TCGV_PTR(base); + *gen_opparam_ptr++ = offset; +} + +static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val, + TCGv_ptr base, TCGArg offset) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(val); + *gen_opparam_ptr++ = GET_TCGV_PTR(base); + *gen_opparam_ptr++ = offset; +} + +static inline void tcg_gen_qemu_ldst_op_i64_i32(TCGOpcode opc, TCGv_i64 val, + TCGv_i32 addr, TCGArg mem_index) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(val); + *gen_opparam_ptr++ = GET_TCGV_I32(addr); + *gen_opparam_ptr++ = mem_index; +} + +static inline void tcg_gen_qemu_ldst_op_i64_i64(TCGOpcode opc, TCGv_i64 val, + TCGv_i64 addr, TCGArg mem_index) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(val); + *gen_opparam_ptr++ = GET_TCGV_I64(addr); + *gen_opparam_ptr++ = mem_index; +} + +static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGv_i32 arg3, TCGv_i32 arg4) +{ + *gen_opc_ptr++ = 
opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); + *gen_opparam_ptr++ = GET_TCGV_I32(arg4); +} + +static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGv_i64 arg3, TCGv_i64 arg4) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); + *gen_opparam_ptr++ = GET_TCGV_I64(arg4); +} + +static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGv_i32 arg3, TCGArg arg4) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); + *gen_opparam_ptr++ = arg4; +} + +static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGv_i64 arg3, TCGArg arg4) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); + *gen_opparam_ptr++ = arg4; +} + +static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGArg arg3, TCGArg arg4) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = arg3; + *gen_opparam_ptr++ = arg4; +} + +static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGArg arg3, TCGArg arg4) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = arg3; + *gen_opparam_ptr++ = arg4; +} + +static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGv_i32 arg3, TCGv_i32 arg4, TCGv_i32 arg5) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); + *gen_opparam_ptr++ = GET_TCGV_I32(arg4); + *gen_opparam_ptr++ = GET_TCGV_I32(arg5); +} + +static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGv_i64 arg3, TCGv_i64 arg4, TCGv_i64 arg5) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); + *gen_opparam_ptr++ = GET_TCGV_I64(arg4); + *gen_opparam_ptr++ = GET_TCGV_I64(arg5); +} + +static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGv_i32 arg3, TCGv_i32 arg4, TCGArg arg5) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); + *gen_opparam_ptr++ = GET_TCGV_I32(arg4); + *gen_opparam_ptr++ = arg5; +} + +static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGv_i64 arg3, TCGv_i64 arg4, TCGArg arg5) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); + *gen_opparam_ptr++ = GET_TCGV_I64(arg4); + *gen_opparam_ptr++ = arg5; +} + +static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGv_i32 arg3, TCGv_i32 arg4, TCGv_i32 arg5, + TCGv_i32 arg6) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); + *gen_opparam_ptr++ = GET_TCGV_I32(arg4); + *gen_opparam_ptr++ = GET_TCGV_I32(arg5); + *gen_opparam_ptr++ 
= GET_TCGV_I32(arg6); +} + +static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGv_i64 arg3, TCGv_i64 arg4, TCGv_i64 arg5, + TCGv_i64 arg6) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); + *gen_opparam_ptr++ = GET_TCGV_I64(arg4); + *gen_opparam_ptr++ = GET_TCGV_I64(arg5); + *gen_opparam_ptr++ = GET_TCGV_I64(arg6); +} + +static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2, + TCGv_i32 arg3, TCGv_i32 arg4, + TCGv_i32 arg5, TCGArg arg6) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); + *gen_opparam_ptr++ = GET_TCGV_I32(arg4); + *gen_opparam_ptr++ = GET_TCGV_I32(arg5); + *gen_opparam_ptr++ = arg6; +} + +static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2, + TCGv_i64 arg3, TCGv_i64 arg4, + TCGv_i64 arg5, TCGArg arg6) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); + *gen_opparam_ptr++ = GET_TCGV_I64(arg4); + *gen_opparam_ptr++ = GET_TCGV_I64(arg5); + *gen_opparam_ptr++ = arg6; +} + +static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 arg1, + TCGv_i32 arg2, TCGv_i32 arg3, + TCGv_i32 arg4, TCGArg arg5, TCGArg arg6) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I32(arg1); + *gen_opparam_ptr++ = GET_TCGV_I32(arg2); + *gen_opparam_ptr++ = GET_TCGV_I32(arg3); + *gen_opparam_ptr++ = GET_TCGV_I32(arg4); + *gen_opparam_ptr++ = arg5; + *gen_opparam_ptr++ = arg6; +} + +static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 arg1, + TCGv_i64 arg2, TCGv_i64 arg3, + TCGv_i64 arg4, TCGArg arg5, TCGArg arg6) +{ + *gen_opc_ptr++ = opc; + *gen_opparam_ptr++ = GET_TCGV_I64(arg1); + *gen_opparam_ptr++ = GET_TCGV_I64(arg2); + *gen_opparam_ptr++ = GET_TCGV_I64(arg3); + *gen_opparam_ptr++ = GET_TCGV_I64(arg4); + *gen_opparam_ptr++ = arg5; + *gen_opparam_ptr++ = arg6; +} + +static inline void gen_set_label(int n) +{ + tcg_gen_op1i(INDEX_op_set_label, n); +} + +static inline void tcg_gen_br(int label) +{ + tcg_gen_op1i(INDEX_op_br, label); +} + +static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg) +{ + if (!TCGV_EQUAL_I32(ret, arg)) + tcg_gen_op2_i32(INDEX_op_mov_i32, ret, arg); +} + +static inline void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg) +{ + tcg_gen_op2i_i32(INDEX_op_movi_i32, ret, arg); +} + +/* A version of dh_sizemask from def-helper.h that doesn't rely on + preprocessor magic. */ +static inline int tcg_gen_sizemask(int n, int is_64bit, int is_signed) +{ + return (is_64bit << n*2) | (is_signed << (n*2 + 1)); +} + +/* helper calls */ +static inline void tcg_gen_helperN(void *func, int flags, int sizemask, + TCGArg ret, int nargs, TCGArg *args) +{ + TCGv_ptr fn; + fn = tcg_const_ptr((tcg_target_long)func); + tcg_gen_callN(&tcg_ctx, fn, flags, sizemask, ret, + nargs, args); + tcg_temp_free_ptr(fn); +} + +/* Note: Both tcg_gen_helper32() and tcg_gen_helper64() are currently + reserved for helpers in tcg-runtime.c. These helpers are all const + and pure, hence the call to tcg_gen_callN() with TCG_CALL_CONST | + TCG_CALL_PURE. This may need to be adjusted if these functions + start to be used with other helpers. 
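+   As an illustrative sketch only, a front end holding TCGv_i32 values
+   ret, a, b and c could route a three-operand operation through the
+   general tcg_gen_helperN() entry point above; "helper_foo_i32" is a
+   purely hypothetical helper name used for the example:
+
+       int sizemask = 0;
+       TCGArg args[3];
+       sizemask |= tcg_gen_sizemask(0, 0, 1);   return value: 32 bit, signed
+       sizemask |= tcg_gen_sizemask(1, 0, 1);   argument 1
+       sizemask |= tcg_gen_sizemask(2, 0, 1);   argument 2
+       sizemask |= tcg_gen_sizemask(3, 0, 1);   argument 3
+       args[0] = GET_TCGV_I32(a);
+       args[1] = GET_TCGV_I32(b);
+       args[2] = GET_TCGV_I32(c);
+       tcg_gen_helperN(helper_foo_i32, 0, sizemask, GET_TCGV_I32(ret), 3, args);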
*/ +static inline void tcg_gen_helper32(void *func, int sizemask, TCGv_i32 ret, + TCGv_i32 a, TCGv_i32 b) +{ + TCGv_ptr fn; + TCGArg args[2]; + fn = tcg_const_ptr((tcg_target_long)func); + args[0] = GET_TCGV_I32(a); + args[1] = GET_TCGV_I32(b); + tcg_gen_callN(&tcg_ctx, fn, TCG_CALL_CONST | TCG_CALL_PURE, sizemask, + GET_TCGV_I32(ret), 2, args); + tcg_temp_free_ptr(fn); +} + +static inline void tcg_gen_helper64(void *func, int sizemask, TCGv_i64 ret, + TCGv_i64 a, TCGv_i64 b) +{ + TCGv_ptr fn; + TCGArg args[2]; + fn = tcg_const_ptr((tcg_target_long)func); + args[0] = GET_TCGV_I64(a); + args[1] = GET_TCGV_I64(b); + tcg_gen_callN(&tcg_ctx, fn, TCG_CALL_CONST | TCG_CALL_PURE, sizemask, + GET_TCGV_I64(ret), 2, args); + tcg_temp_free_ptr(fn); +} + +/* 32 bit ops */ + +static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_ld8u_i32, ret, arg2, offset); +} + +static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_ld8s_i32, ret, arg2, offset); +} + +static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_ld16u_i32, ret, arg2, offset); +} + +static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_ld16s_i32, ret, arg2, offset); +} + +static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_ld_i32, ret, arg2, offset); +} + +static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_st8_i32, arg1, arg2, offset); +} + +static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_st16_i32, arg1, arg2, offset); +} + +static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i32(INDEX_op_st_i32, arg1, arg2, offset); +} + +static inline void tcg_gen_add_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_add_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_add_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_sub_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_sub_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2) +{ + TCGv_i32 t0 = tcg_const_i32(arg1); + tcg_gen_sub_i32(ret, t0, arg2); + tcg_temp_free_i32(t0); +} + +static inline void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_sub_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_and_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + if (TCGV_EQUAL_I32(arg1, arg2)) { + tcg_gen_mov_i32(ret, arg1); + } else { + tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2); + } +} + +static inline void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_movi_i32(ret, 0); + } else if (arg2 == 0xffffffff) { + tcg_gen_mov_i32(ret, arg1); + } 
else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_and_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_or_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + if (TCGV_EQUAL_I32(arg1, arg2)) { + tcg_gen_mov_i32(ret, arg1); + } else { + tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2); + } +} + +static inline void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0xffffffff) { + tcg_gen_movi_i32(ret, 0xffffffff); + } else if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_or_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_xor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + if (TCGV_EQUAL_I32(arg1, arg2)) { + tcg_gen_movi_i32(ret, 0); + } else { + tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2); + } +} + +static inline void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_xor_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_shl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_shl_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_shl_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_shr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_shr_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_shr_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_sar_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_sar_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_sar_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); + } +} + +static inline void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, + TCGv_i32 arg2, int label_index) +{ + tcg_gen_op4ii_i32(INDEX_op_brcond_i32, arg1, arg2, cond, label_index); +} + +static inline void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, + int32_t arg2, int label_index) +{ + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_brcond_i32(cond, arg1, t0, label_index); + tcg_temp_free_i32(t0); +} + +static inline void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret, + TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op4i_i32(INDEX_op_setcond_i32, ret, arg1, arg2, cond); +} + +static inline void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret, + TCGv_i32 arg1, int32_t arg2) +{ + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_setcond_i32(cond, ret, arg1, t0); + tcg_temp_free_i32(t0); +} + +static inline void tcg_gen_mul_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_mul_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_mul_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); +} + +#ifdef TCG_TARGET_HAS_div_i32 +static inline void 
tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_div_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_rem_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_divu_i32, ret, arg1, arg2); +} + +static inline void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + tcg_gen_op3_i32(INDEX_op_remu_i32, ret, arg1, arg2); +} +#elif defined(TCG_TARGET_HAS_div2_i32) +static inline void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + TCGv_i32 t0; + t0 = tcg_temp_new_i32(); + tcg_gen_sari_i32(t0, arg1, 31); + tcg_gen_op5_i32(INDEX_op_div2_i32, ret, t0, arg1, t0, arg2); + tcg_temp_free_i32(t0); +} + +static inline void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + TCGv_i32 t0; + t0 = tcg_temp_new_i32(); + tcg_gen_sari_i32(t0, arg1, 31); + tcg_gen_op5_i32(INDEX_op_div2_i32, t0, ret, arg1, t0, arg2); + tcg_temp_free_i32(t0); +} + +static inline void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + TCGv_i32 t0; + t0 = tcg_temp_new_i32(); + tcg_gen_movi_i32(t0, 0); + tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, t0, arg2); + tcg_temp_free_i32(t0); +} + +static inline void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + TCGv_i32 t0; + t0 = tcg_temp_new_i32(); + tcg_gen_movi_i32(t0, 0); + tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, t0, arg2); + tcg_temp_free_i32(t0); +} +#else +static inline void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 32-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 0, 1); + sizemask |= tcg_gen_sizemask(1, 0, 1); + sizemask |= tcg_gen_sizemask(2, 0, 1); + + tcg_gen_helper32(tcg_helper_div_i32, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 32-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 0, 1); + sizemask |= tcg_gen_sizemask(1, 0, 1); + sizemask |= tcg_gen_sizemask(2, 0, 1); + + tcg_gen_helper32(tcg_helper_rem_i32, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 32-bit and unsigned. */ + sizemask |= tcg_gen_sizemask(0, 0, 0); + sizemask |= tcg_gen_sizemask(1, 0, 0); + sizemask |= tcg_gen_sizemask(2, 0, 0); + + tcg_gen_helper32(tcg_helper_divu_i32, ret, arg1, arg2, 0); +} + +static inline void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 32-bit and unsigned. 
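+   As a worked example of the encoding: tcg_gen_sizemask(n, is_64bit,
+   is_signed) above evaluates to (is_64bit << n*2) | (is_signed << (n*2 + 1)),
+   so the three signed 32-bit slots used by the div/rem helpers give
+   2 | 8 | 32 = 42, while the three unsigned 32-bit slots here give 0.
+   A call matching the tcg_gen_helper32(func, sizemask, ret, a, b)
+   signature above would read, for instance:
+
+       tcg_gen_helper32(tcg_helper_remu_i32, sizemask, ret, arg1, arg2);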
*/ + sizemask |= tcg_gen_sizemask(0, 0, 0); + sizemask |= tcg_gen_sizemask(1, 0, 0); + sizemask |= tcg_gen_sizemask(2, 0, 0); + + tcg_gen_helper32(tcg_helper_remu_i32, ret, arg1, arg2, 0); +} +#endif + +#if TCG_TARGET_REG_BITS == 32 + +static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + if (!TCGV_EQUAL_I64(ret, arg)) { + tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg)); + } +} + +static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg) +{ + tcg_gen_movi_i32(TCGV_LOW(ret), arg); + tcg_gen_movi_i32(TCGV_HIGH(ret), arg >> 32); +} + +static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_ld8u_i32(TCGV_LOW(ret), arg2, offset); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_ld8s_i32(TCGV_LOW(ret), arg2, offset); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), 31); +} + +static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_ld16u_i32(TCGV_LOW(ret), arg2, offset); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_ld16s_i32(TCGV_LOW(ret), arg2, offset); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +} + +static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +} + +static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, + tcg_target_long offset) +{ + /* since arg2 and ret have different types, they cannot be the + same temporary */ +#ifdef TCG_TARGET_WORDS_BIGENDIAN + tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset); + tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset + 4); +#else + tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset); + tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset + 4); +#endif +} + +static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_st8_i32(TCGV_LOW(arg1), arg2, offset); +} + +static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_st16_i32(TCGV_LOW(arg1), arg2, offset); +} + +static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2, + tcg_target_long offset) +{ + tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset); +} + +static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, + tcg_target_long offset) +{ +#ifdef TCG_TARGET_WORDS_BIGENDIAN + tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset); + tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset + 4); +#else + tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset); + tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset + 4); +#endif +} + +static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op6_i32(INDEX_op_add2_i32, TCGV_LOW(ret), TCGV_HIGH(ret), + TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2), + TCGV_HIGH(arg2)); +} + +static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op6_i32(INDEX_op_sub2_i32, TCGV_LOW(ret), TCGV_HIGH(ret), + TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2), + TCGV_HIGH(arg2)); +} + +static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + 
tcg_gen_and_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_and_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +} + +static inline void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + tcg_gen_andi_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2); + tcg_gen_andi_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32); +} + +static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +} + +static inline void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + tcg_gen_ori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2); + tcg_gen_ori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32); +} + +static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_xor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_xor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +} + +static inline void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + tcg_gen_xori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2); + tcg_gen_xori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32); +} + +/* XXX: use generic code when basic block handling is OK or CPU + specific code (x86) */ +static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 1, 1); + sizemask |= tcg_gen_sizemask(1, 1, 1); + sizemask |= tcg_gen_sizemask(2, 1, 1); + + tcg_gen_helper64(tcg_helper_shl_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0); +} + +static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 1, 1); + sizemask |= tcg_gen_sizemask(1, 1, 1); + sizemask |= tcg_gen_sizemask(2, 1, 1); + + tcg_gen_helper64(tcg_helper_shr_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0); +} + +static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and signed. 
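+   (Worked out from the encoding above: each 64-bit signed slot n
+   contributes (1 << n*2) for "64 bit" plus (1 << (n*2 + 1)) for
+   "signed", so the three slots below give 3 | 12 | 48 = 0x3f.)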
*/ + sizemask |= tcg_gen_sizemask(0, 1, 1); + sizemask |= tcg_gen_sizemask(1, 1, 1); + sizemask |= tcg_gen_sizemask(2, 1, 1); + + tcg_gen_helper64(tcg_helper_sar_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1); +} + +static inline void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, + TCGv_i64 arg2, int label_index) +{ + tcg_gen_op6ii_i32(INDEX_op_brcond2_i32, + TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2), + TCGV_HIGH(arg2), cond, label_index); +} + +static inline void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret, + TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op6i_i32(INDEX_op_setcond2_i32, TCGV_LOW(ret), + TCGV_LOW(arg1), TCGV_HIGH(arg1), + TCGV_LOW(arg2), TCGV_HIGH(arg2), cond); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + TCGv_i64 t0; + TCGv_i32 t1; + + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i32(); + + tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0), + TCGV_LOW(arg1), TCGV_LOW(arg2)); + + tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2)); + tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1); + tcg_gen_mul_i32(t1, TCGV_HIGH(arg1), TCGV_LOW(arg2)); + tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1); + + tcg_gen_mov_i64(ret, t0); + tcg_temp_free_i64(t0); + tcg_temp_free_i32(t1); +} + +static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 1, 1); + sizemask |= tcg_gen_sizemask(1, 1, 1); + sizemask |= tcg_gen_sizemask(2, 1, 1); + + tcg_gen_helper64(tcg_helper_div_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 1, 1); + sizemask |= tcg_gen_sizemask(1, 1, 1); + sizemask |= tcg_gen_sizemask(2, 1, 1); + + tcg_gen_helper64(tcg_helper_rem_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and unsigned. */ + sizemask |= tcg_gen_sizemask(0, 1, 0); + sizemask |= tcg_gen_sizemask(1, 1, 0); + sizemask |= tcg_gen_sizemask(2, 1, 0); + + tcg_gen_helper64(tcg_helper_divu_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and unsigned. 
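+   A short aside on the 64-bit multiply a few functions above: writing
+   a = aH*2^32 + aL and b = bH*2^32 + bL, the full product is
+       a*b = aH*bH*2^64 + (aL*bH + aH*bL)*2^32 + aL*bL,
+   and modulo 2^64 the aH*bH term vanishes; hence mulu2 on the low
+   halves plus the two cross products added into the high word is
+   enough.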
*/ + sizemask |= tcg_gen_sizemask(0, 1, 0); + sizemask |= tcg_gen_sizemask(1, 1, 0); + sizemask |= tcg_gen_sizemask(2, 1, 0); + + tcg_gen_helper64(tcg_helper_remu_i64, sizemask, ret, arg1, arg2); +} + +#else + +static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + if (!TCGV_EQUAL_I64(ret, arg)) + tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg); +} + +static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg) +{ + tcg_gen_op2i_i64(INDEX_op_movi_i64, ret, arg); +} + +static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_ld8u_i64, ret, arg2, offset); +} + +static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_ld8s_i64, ret, arg2, offset); +} + +static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_ld16u_i64, ret, arg2, offset); +} + +static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_ld16s_i64, ret, arg2, offset); +} + +static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_ld32u_i64, ret, arg2, offset); +} + +static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_ld32s_i64, ret, arg2, offset); +} + +static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_i64 arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset); +} + +static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_st8_i64, arg1, arg2, offset); +} + +static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_st16_i64, arg1, arg2, offset); +} + +static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_i64 arg2, + tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_st32_i64, arg1, arg2, offset); +} + +static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_i64 arg2, tcg_target_long offset) +{ + tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset); +} + +static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_add_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_sub_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + if (TCGV_EQUAL_I64(arg1, arg2)) { + tcg_gen_mov_i64(ret, arg1); + } else { + tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2); + } +} + +static inline void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_and_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + if (TCGV_EQUAL_I64(arg1, arg2)) { + tcg_gen_mov_i64(ret, arg1); + } else { + tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2); + } +} + +static inline void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_or_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + if (TCGV_EQUAL_I64(arg1, arg2)) { + tcg_gen_movi_i64(ret, 0); + } else { + tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, 
arg2); + } +} + +static inline void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_xor_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_shl_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + if (arg2 == 0) { + tcg_gen_mov_i64(ret, arg1); + } else { + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_shl_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); + } +} + +static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_shr_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + if (arg2 == 0) { + tcg_gen_mov_i64(ret, arg1); + } else { + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_shr_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); + } +} + +static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_sar_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + if (arg2 == 0) { + tcg_gen_mov_i64(ret, arg1); + } else { + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_sar_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); + } +} + +static inline void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, + TCGv_i64 arg2, int label_index) +{ + tcg_gen_op4ii_i64(INDEX_op_brcond_i64, arg1, arg2, cond, label_index); +} + +static inline void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret, + TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op4i_i64(INDEX_op_setcond_i64, ret, arg1, arg2, cond); +} + +static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2); +} + +#ifdef TCG_TARGET_HAS_div_i64 +static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_div_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_rem_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_divu_i64, ret, arg1, arg2); +} + +static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + tcg_gen_op3_i64(INDEX_op_remu_i64, ret, arg1, arg2); +} +#elif defined(TCG_TARGET_HAS_div2_i64) +static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + TCGv_i64 t0; + t0 = tcg_temp_new_i64(); + tcg_gen_sari_i64(t0, arg1, 63); + tcg_gen_op5_i64(INDEX_op_div2_i64, ret, t0, arg1, t0, arg2); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + TCGv_i64 t0; + t0 = tcg_temp_new_i64(); + tcg_gen_sari_i64(t0, arg1, 63); + tcg_gen_op5_i64(INDEX_op_div2_i64, t0, ret, arg1, t0, arg2); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + TCGv_i64 t0; + t0 = tcg_temp_new_i64(); + tcg_gen_movi_i64(t0, 0); + tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, t0, arg2); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + TCGv_i64 t0; + t0 = tcg_temp_new_i64(); + tcg_gen_movi_i64(t0, 0); + tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, t0, arg2); + tcg_temp_free_i64(t0); +} +#else +static inline void 
tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 1, 1); + sizemask |= tcg_gen_sizemask(1, 1, 1); + sizemask |= tcg_gen_sizemask(2, 1, 1); + + tcg_gen_helper64(tcg_helper_div_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and signed. */ + sizemask |= tcg_gen_sizemask(0, 1, 1); + sizemask |= tcg_gen_sizemask(1, 1, 1); + sizemask |= tcg_gen_sizemask(2, 1, 1); + + tcg_gen_helper64(tcg_helper_rem_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and unsigned. */ + sizemask |= tcg_gen_sizemask(0, 1, 0); + sizemask |= tcg_gen_sizemask(1, 1, 0); + sizemask |= tcg_gen_sizemask(2, 1, 0); + + tcg_gen_helper64(tcg_helper_divu_i64, sizemask, ret, arg1, arg2); +} + +static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ + int sizemask = 0; + /* Return value and both arguments are 64-bit and unsigned. */ + sizemask |= tcg_gen_sizemask(0, 1, 0); + sizemask |= tcg_gen_sizemask(1, 1, 0); + sizemask |= tcg_gen_sizemask(2, 1, 0); + + tcg_gen_helper64(tcg_helper_remu_i64, sizemask, ret, arg1, arg2); +} +#endif + +#endif + +static inline void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i64(ret, arg1); + } else { + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_add_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); + } +} + +static inline void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2) +{ + TCGv_i64 t0 = tcg_const_i64(arg1); + tcg_gen_sub_i64(ret, t0, arg2); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i64(ret, arg1); + } else { + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_sub_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); + } +} +static inline void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, + int64_t arg2, int label_index) +{ + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_brcond_i64(cond, arg1, t0, label_index); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret, + TCGv_i64 arg1, int64_t arg2) +{ + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_setcond_i64(cond, ret, arg1, t0); + tcg_temp_free_i64(t0); +} + +static inline void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_mul_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); +} + + +/***************************************/ +/* optional operations */ + +static inline void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_ext8s_i32 + tcg_gen_op2_i32(INDEX_op_ext8s_i32, ret, arg); +#else + tcg_gen_shli_i32(ret, arg, 24); + tcg_gen_sari_i32(ret, ret, 24); +#endif +} + +static inline void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_ext16s_i32 + tcg_gen_op2_i32(INDEX_op_ext16s_i32, ret, arg); +#else + tcg_gen_shli_i32(ret, arg, 16); + tcg_gen_sari_i32(ret, ret, 16); +#endif +} + +static inline void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_ext8u_i32 + tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg); +#else + 
tcg_gen_andi_i32(ret, arg, 0xffu); +#endif +} + +static inline void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_ext16u_i32 + tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg); +#else + tcg_gen_andi_i32(ret, arg, 0xffffu); +#endif +} + +/* Note: we assume the two high bytes are set to zero */ +static inline void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_bswap16_i32 + tcg_gen_op2_i32(INDEX_op_bswap16_i32, ret, arg); +#else + TCGv_i32 t0 = tcg_temp_new_i32(); + + tcg_gen_ext8u_i32(t0, arg); + tcg_gen_shli_i32(t0, t0, 8); + tcg_gen_shri_i32(ret, arg, 8); + tcg_gen_or_i32(ret, ret, t0); + tcg_temp_free_i32(t0); +#endif +} + +static inline void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_bswap32_i32 + tcg_gen_op2_i32(INDEX_op_bswap32_i32, ret, arg); +#else + TCGv_i32 t0, t1; + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + + tcg_gen_shli_i32(t0, arg, 24); + + tcg_gen_andi_i32(t1, arg, 0x0000ff00); + tcg_gen_shli_i32(t1, t1, 8); + tcg_gen_or_i32(t0, t0, t1); + + tcg_gen_shri_i32(t1, arg, 8); + tcg_gen_andi_i32(t1, t1, 0x0000ff00); + tcg_gen_or_i32(t0, t0, t1); + + tcg_gen_shri_i32(t1, arg, 24); + tcg_gen_or_i32(ret, t0, t1); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +#endif +} + +#if TCG_TARGET_REG_BITS == 32 +static inline void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_ext8s_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +} + +static inline void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_ext16s_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +} + +static inline void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +} + +static inline void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_ext8u_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_ext16u_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_trunc_i64_i32(TCGv_i32 ret, TCGv_i64 arg) +{ + tcg_gen_mov_i32(ret, TCGV_LOW(arg)); +} + +static inline void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg) +{ + tcg_gen_mov_i32(TCGV_LOW(ret), arg); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +} + +static inline void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg) +{ + tcg_gen_mov_i32(TCGV_LOW(ret), arg); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +} + +/* Note: we assume the six high bytes are set to zero */ +static inline void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg)); + tcg_gen_bswap16_i32(TCGV_LOW(ret), TCGV_LOW(arg)); +} + +/* Note: we assume the four high bytes are set to zero */ +static inline void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg)); + tcg_gen_bswap32_i32(TCGV_LOW(ret), TCGV_LOW(arg)); +} + +static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg) +{ + TCGv_i32 t0, t1; + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + + tcg_gen_bswap32_i32(t0, TCGV_LOW(arg)); + tcg_gen_bswap32_i32(t1, TCGV_HIGH(arg)); + tcg_gen_mov_i32(TCGV_LOW(ret), t1); + 
tcg_gen_mov_i32(TCGV_HIGH(ret), t0); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +} +#else + +static inline void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_ext8s_i64 + tcg_gen_op2_i64(INDEX_op_ext8s_i64, ret, arg); +#else + tcg_gen_shli_i64(ret, arg, 56); + tcg_gen_sari_i64(ret, ret, 56); +#endif +} + +static inline void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_ext16s_i64 + tcg_gen_op2_i64(INDEX_op_ext16s_i64, ret, arg); +#else + tcg_gen_shli_i64(ret, arg, 48); + tcg_gen_sari_i64(ret, ret, 48); +#endif +} + +static inline void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_ext32s_i64 + tcg_gen_op2_i64(INDEX_op_ext32s_i64, ret, arg); +#else + tcg_gen_shli_i64(ret, arg, 32); + tcg_gen_sari_i64(ret, ret, 32); +#endif +} + +static inline void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_ext8u_i64 + tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg); +#else + tcg_gen_andi_i64(ret, arg, 0xffu); +#endif +} + +static inline void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_ext16u_i64 + tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg); +#else + tcg_gen_andi_i64(ret, arg, 0xffffu); +#endif +} + +static inline void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_ext32u_i64 + tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg); +#else + tcg_gen_andi_i64(ret, arg, 0xffffffffu); +#endif +} + +/* Note: we assume the target supports move between 32 and 64 bit + registers. This will probably break MIPS64 targets. */ +static inline void tcg_gen_trunc_i64_i32(TCGv_i32 ret, TCGv_i64 arg) +{ + tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(arg))); +} + +/* Note: we assume the target supports move between 32 and 64 bit + registers */ +static inline void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg) +{ + tcg_gen_ext32u_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg))); +} + +/* Note: we assume the target supports move between 32 and 64 bit + registers */ +static inline void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg) +{ + tcg_gen_ext32s_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg))); +} + +/* Note: we assume the six high bytes are set to zero */ +static inline void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_bswap16_i64 + tcg_gen_op2_i64(INDEX_op_bswap16_i64, ret, arg); +#else + TCGv_i64 t0 = tcg_temp_new_i64(); + + tcg_gen_ext8u_i64(t0, arg); + tcg_gen_shli_i64(t0, t0, 8); + tcg_gen_shri_i64(ret, arg, 8); + tcg_gen_or_i64(ret, ret, t0); + tcg_temp_free_i64(t0); +#endif +} + +/* Note: we assume the four high bytes are set to zero */ +static inline void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_bswap32_i64 + tcg_gen_op2_i64(INDEX_op_bswap32_i64, ret, arg); +#else + TCGv_i64 t0, t1; + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t0, arg, 24); + tcg_gen_ext32u_i64(t0, t0); + + tcg_gen_andi_i64(t1, arg, 0x0000ff00); + tcg_gen_shli_i64(t1, t1, 8); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_shri_i64(t1, arg, 8); + tcg_gen_andi_i64(t1, t1, 0x0000ff00); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_shri_i64(t1, arg, 24); + tcg_gen_or_i64(ret, t0, t1); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +#endif +} + +static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_bswap64_i64 + tcg_gen_op2_i64(INDEX_op_bswap64_i64, ret, arg); +#else + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t0, arg, 56); + 
+ tcg_gen_andi_i64(t1, arg, 0x0000ff00); + tcg_gen_shli_i64(t1, t1, 40); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_andi_i64(t1, arg, 0x00ff0000); + tcg_gen_shli_i64(t1, t1, 24); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_andi_i64(t1, arg, 0xff000000); + tcg_gen_shli_i64(t1, t1, 8); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_shri_i64(t1, arg, 8); + tcg_gen_andi_i64(t1, t1, 0xff000000); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_shri_i64(t1, arg, 24); + tcg_gen_andi_i64(t1, t1, 0x00ff0000); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_shri_i64(t1, arg, 40); + tcg_gen_andi_i64(t1, t1, 0x0000ff00); + tcg_gen_or_i64(t0, t0, t1); + + tcg_gen_shri_i64(t1, arg, 56); + tcg_gen_or_i64(ret, t0, t1); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +#endif +} + +#endif + +static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_neg_i32 + tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg); +#else + TCGv_i32 t0 = tcg_const_i32(0); + tcg_gen_sub_i32(ret, t0, arg); + tcg_temp_free_i32(t0); +#endif +} + +static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_neg_i64 + tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg); +#else + TCGv_i64 t0 = tcg_const_i64(0); + tcg_gen_sub_i64(ret, t0, arg); + tcg_temp_free_i64(t0); +#endif +} + +static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg) +{ +#ifdef TCG_TARGET_HAS_not_i32 + tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg); +#else + tcg_gen_xori_i32(ret, arg, -1); +#endif +} + +static inline void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg) +{ +#ifdef TCG_TARGET_HAS_not_i64 + tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg); +#elif defined(TCG_TARGET_HAS_not_i32) && TCG_TARGET_REG_BITS == 32 + tcg_gen_not_i32(TCGV_LOW(ret), TCGV_LOW(arg)); + tcg_gen_not_i32(TCGV_HIGH(ret), TCGV_HIGH(arg)); +#else + tcg_gen_xori_i64(ret, arg, -1); +#endif +} + +static inline void tcg_gen_discard_i32(TCGv_i32 arg) +{ + tcg_gen_op1_i32(INDEX_op_discard, arg); +} + +#if TCG_TARGET_REG_BITS == 32 +static inline void tcg_gen_discard_i64(TCGv_i64 arg) +{ + tcg_gen_discard_i32(TCGV_LOW(arg)); + tcg_gen_discard_i32(TCGV_HIGH(arg)); +} +#else +static inline void tcg_gen_discard_i64(TCGv_i64 arg) +{ + tcg_gen_op1_i64(INDEX_op_discard, arg); +} +#endif + +static inline void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high) +{ +#if TCG_TARGET_REG_BITS == 32 + tcg_gen_mov_i32(TCGV_LOW(dest), low); + tcg_gen_mov_i32(TCGV_HIGH(dest), high); +#else + TCGv_i64 tmp = tcg_temp_new_i64(); + /* This extension is only needed for type correctness. + We may be able to do better given target specific information. 
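+   Concretely, the sequence below computes
+       dest = ((uint64_t)high << 32) | (uint32_t)low;
+   e.g. low = 0x89abcdef and high = 0x01234567 yield 0x0123456789abcdef.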
*/ + tcg_gen_extu_i32_i64(tmp, high); + tcg_gen_shli_i64(tmp, tmp, 32); + tcg_gen_extu_i32_i64(dest, low); + tcg_gen_or_i64(dest, dest, tmp); + tcg_temp_free_i64(tmp); +#endif +} + +static inline void tcg_gen_concat32_i64(TCGv_i64 dest, TCGv_i64 low, TCGv_i64 high) +{ +#if TCG_TARGET_REG_BITS == 32 + tcg_gen_concat_i32_i64(dest, TCGV_LOW(low), TCGV_LOW(high)); +#else + TCGv_i64 tmp = tcg_temp_new_i64(); + tcg_gen_ext32u_i64(dest, low); + tcg_gen_shli_i64(tmp, high, 32); + tcg_gen_or_i64(dest, dest, tmp); + tcg_temp_free_i64(tmp); +#endif +} + +static inline void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ +#ifdef TCG_TARGET_HAS_andc_i32 + tcg_gen_op3_i32(INDEX_op_andc_i32, ret, arg1, arg2); +#else + TCGv_i32 t0; + t0 = tcg_temp_new_i32(); + tcg_gen_not_i32(t0, arg2); + tcg_gen_and_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); +#endif +} + +static inline void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ +#ifdef TCG_TARGET_HAS_andc_i64 + tcg_gen_op3_i64(INDEX_op_andc_i64, ret, arg1, arg2); +#elif defined(TCG_TARGET_HAS_andc_i32) && TCG_TARGET_REG_BITS == 32 + tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +#else + TCGv_i64 t0; + t0 = tcg_temp_new_i64(); + tcg_gen_not_i64(t0, arg2); + tcg_gen_and_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); +#endif +} + +static inline void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ +#ifdef TCG_TARGET_HAS_eqv_i32 + tcg_gen_op3_i32(INDEX_op_eqv_i32, ret, arg1, arg2); +#else + tcg_gen_xor_i32(ret, arg1, arg2); + tcg_gen_not_i32(ret, ret); +#endif +} + +static inline void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ +#ifdef TCG_TARGET_HAS_eqv_i64 + tcg_gen_op3_i64(INDEX_op_eqv_i64, ret, arg1, arg2); +#elif defined(TCG_TARGET_HAS_eqv_i32) && TCG_TARGET_REG_BITS == 32 + tcg_gen_eqv_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_eqv_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +#else + tcg_gen_xor_i64(ret, arg1, arg2); + tcg_gen_not_i64(ret, ret); +#endif +} + +static inline void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ +#ifdef TCG_TARGET_HAS_nand_i32 + tcg_gen_op3_i32(INDEX_op_nand_i32, ret, arg1, arg2); +#else + tcg_gen_and_i32(ret, arg1, arg2); + tcg_gen_not_i32(ret, ret); +#endif +} + +static inline void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ +#ifdef TCG_TARGET_HAS_nand_i64 + tcg_gen_op3_i64(INDEX_op_nand_i64, ret, arg1, arg2); +#elif defined(TCG_TARGET_HAS_nand_i32) && TCG_TARGET_REG_BITS == 32 + tcg_gen_nand_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_nand_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +#else + tcg_gen_and_i64(ret, arg1, arg2); + tcg_gen_not_i64(ret, ret); +#endif +} + +static inline void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ +#ifdef TCG_TARGET_HAS_nor_i32 + tcg_gen_op3_i32(INDEX_op_nor_i32, ret, arg1, arg2); +#else + tcg_gen_or_i32(ret, arg1, arg2); + tcg_gen_not_i32(ret, ret); +#endif +} + +static inline void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ +#ifdef TCG_TARGET_HAS_nor_i64 + tcg_gen_op3_i64(INDEX_op_nor_i64, ret, arg1, arg2); +#elif defined(TCG_TARGET_HAS_nor_i32) && TCG_TARGET_REG_BITS == 32 + tcg_gen_nor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_nor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +#else + tcg_gen_or_i64(ret, arg1, arg2); + tcg_gen_not_i64(ret, ret); +#endif +} + +static inline void 
tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ +#ifdef TCG_TARGET_HAS_orc_i32 + tcg_gen_op3_i32(INDEX_op_orc_i32, ret, arg1, arg2); +#else + TCGv_i32 t0; + t0 = tcg_temp_new_i32(); + tcg_gen_not_i32(t0, arg2); + tcg_gen_or_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); +#endif +} + +static inline void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ +#ifdef TCG_TARGET_HAS_orc_i64 + tcg_gen_op3_i64(INDEX_op_orc_i64, ret, arg1, arg2); +#elif defined(TCG_TARGET_HAS_orc_i32) && TCG_TARGET_REG_BITS == 32 + tcg_gen_orc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2)); + tcg_gen_orc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2)); +#else + TCGv_i64 t0; + t0 = tcg_temp_new_i64(); + tcg_gen_not_i64(t0, arg2); + tcg_gen_or_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); +#endif +} + +static inline void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ +#ifdef TCG_TARGET_HAS_rot_i32 + tcg_gen_op3_i32(INDEX_op_rotl_i32, ret, arg1, arg2); +#else + TCGv_i32 t0, t1; + + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + tcg_gen_shl_i32(t0, arg1, arg2); + tcg_gen_subfi_i32(t1, 32, arg2); + tcg_gen_shr_i32(t1, arg1, t1); + tcg_gen_or_i32(ret, t0, t1); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +#endif +} + +static inline void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ +#ifdef TCG_TARGET_HAS_rot_i64 + tcg_gen_op3_i64(INDEX_op_rotl_i64, ret, arg1, arg2); +#else + TCGv_i64 t0, t1; + + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + tcg_gen_shl_i64(t0, arg1, arg2); + tcg_gen_subfi_i64(t1, 64, arg2); + tcg_gen_shr_i64(t1, arg1, t1); + tcg_gen_or_i64(ret, t0, t1); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +#endif +} + +static inline void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { +#ifdef TCG_TARGET_HAS_rot_i32 + TCGv_i32 t0 = tcg_const_i32(arg2); + tcg_gen_rotl_i32(ret, arg1, t0); + tcg_temp_free_i32(t0); +#else + TCGv_i32 t0, t1; + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + tcg_gen_shli_i32(t0, arg1, arg2); + tcg_gen_shri_i32(t1, arg1, 32 - arg2); + tcg_gen_or_i32(ret, t0, t1); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +#endif + } +} + +static inline void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i64(ret, arg1); + } else { +#ifdef TCG_TARGET_HAS_rot_i64 + TCGv_i64 t0 = tcg_const_i64(arg2); + tcg_gen_rotl_i64(ret, arg1, t0); + tcg_temp_free_i64(t0); +#else + TCGv_i64 t0, t1; + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + tcg_gen_shli_i64(t0, arg1, arg2); + tcg_gen_shri_i64(t1, arg1, 64 - arg2); + tcg_gen_or_i64(ret, t0, t1); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +#endif + } +} + +static inline void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) +{ +#ifdef TCG_TARGET_HAS_rot_i32 + tcg_gen_op3_i32(INDEX_op_rotr_i32, ret, arg1, arg2); +#else + TCGv_i32 t0, t1; + + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + tcg_gen_shr_i32(t0, arg1, arg2); + tcg_gen_subfi_i32(t1, 32, arg2); + tcg_gen_shl_i32(t1, arg1, t1); + tcg_gen_or_i32(ret, t0, t1); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +#endif +} + +static inline void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) +{ +#ifdef TCG_TARGET_HAS_rot_i64 + tcg_gen_op3_i64(INDEX_op_rotr_i64, ret, arg1, arg2); +#else + TCGv_i64 t0, t1; + + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + 
tcg_gen_shr_i64(t0, arg1, arg2); + tcg_gen_subfi_i64(t1, 64, arg2); + tcg_gen_shl_i64(t1, arg1, t1); + tcg_gen_or_i64(ret, t0, t1); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +#endif +} + +static inline void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i32(ret, arg1); + } else { + tcg_gen_rotli_i32(ret, arg1, 32 - arg2); + } +} + +static inline void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) +{ + /* some cases can be optimized here */ + if (arg2 == 0) { + tcg_gen_mov_i64(ret, arg1); + } else { + tcg_gen_rotli_i64(ret, arg1, 64 - arg2); + } +} + +/***************************************/ +/* QEMU specific operations. Their type depend on the QEMU CPU + type. */ +#ifndef TARGET_LONG_BITS +#error must include QEMU headers +#endif + +#if TARGET_LONG_BITS == 32 +#define TCGv TCGv_i32 +#define tcg_temp_new() tcg_temp_new_i32() +#define tcg_global_reg_new tcg_global_reg_new_i32 +#define tcg_global_mem_new tcg_global_mem_new_i32 +#define tcg_temp_local_new() tcg_temp_local_new_i32() +#define tcg_temp_free tcg_temp_free_i32 +#define tcg_gen_qemu_ldst_op tcg_gen_op3i_i32 +#define tcg_gen_qemu_ldst_op_i64 tcg_gen_qemu_ldst_op_i64_i32 +#define TCGV_UNUSED(x) TCGV_UNUSED_I32(x) +#define TCGV_EQUAL(a, b) TCGV_EQUAL_I32(a, b) +#else +#define TCGv TCGv_i64 +#define tcg_temp_new() tcg_temp_new_i64() +#define tcg_global_reg_new tcg_global_reg_new_i64 +#define tcg_global_mem_new tcg_global_mem_new_i64 +#define tcg_temp_local_new() tcg_temp_local_new_i64() +#define tcg_temp_free tcg_temp_free_i64 +#define tcg_gen_qemu_ldst_op tcg_gen_op3i_i64 +#define tcg_gen_qemu_ldst_op_i64 tcg_gen_qemu_ldst_op_i64_i64 +#define TCGV_UNUSED(x) TCGV_UNUSED_I64(x) +#define TCGV_EQUAL(a, b) TCGV_EQUAL_I64(a, b) +#endif + +/* debug info: write the PC of the corresponding QEMU CPU instruction */ +static inline void tcg_gen_debug_insn_start(uint64_t pc) +{ + /* XXX: must really use a 32 bit size for TCGArg in all cases */ +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS + tcg_gen_op2ii(INDEX_op_debug_insn_start, + (uint32_t)(pc), (uint32_t)(pc >> 32)); +#else + tcg_gen_op1i(INDEX_op_debug_insn_start, pc); +#endif +} + +static inline void tcg_gen_exit_tb(tcg_target_long val) +{ + tcg_gen_op1i(INDEX_op_exit_tb, val); +} + +static inline void tcg_gen_goto_tb(int idx) +{ + tcg_gen_op1i(INDEX_op_goto_tb, idx); +} + +#if TCG_TARGET_REG_BITS == 32 +static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_ld8u, ret, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_ld8u, TCGV_LOW(ret), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +#endif +} + +static inline void tcg_gen_qemu_ld8s(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_ld8s, ret, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_ld8s, TCGV_LOW(ret), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +#endif +} + +static inline void tcg_gen_qemu_ld16u(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_ld16u, ret, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_ld16u, TCGV_LOW(ret), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +#endif +} + +static inline void tcg_gen_qemu_ld16s(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS 
== 32 + tcg_gen_op3i_i32(INDEX_op_qemu_ld16s, ret, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_ld16s, TCGV_LOW(ret), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +#endif +} + +static inline void tcg_gen_qemu_ld32u(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_ld32, ret, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_ld32, TCGV_LOW(ret), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); +#endif +} + +static inline void tcg_gen_qemu_ld32s(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_ld32, ret, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_ld32, TCGV_LOW(ret), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31); +#endif +} + +static inline void tcg_gen_qemu_ld64(TCGv_i64 ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op4i_i32(INDEX_op_qemu_ld64, TCGV_LOW(ret), TCGV_HIGH(ret), addr, mem_index); +#else + tcg_gen_op5i_i32(INDEX_op_qemu_ld64, TCGV_LOW(ret), TCGV_HIGH(ret), + TCGV_LOW(addr), TCGV_HIGH(addr), mem_index); +#endif +} + +static inline void tcg_gen_qemu_st8(TCGv arg, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_st8, arg, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_st8, TCGV_LOW(arg), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); +#endif +} + +static inline void tcg_gen_qemu_st16(TCGv arg, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_st16, arg, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_st16, TCGV_LOW(arg), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); +#endif +} + +static inline void tcg_gen_qemu_st32(TCGv arg, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op3i_i32(INDEX_op_qemu_st32, arg, addr, mem_index); +#else + tcg_gen_op4i_i32(INDEX_op_qemu_st32, TCGV_LOW(arg), TCGV_LOW(addr), + TCGV_HIGH(addr), mem_index); +#endif +} + +static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_op4i_i32(INDEX_op_qemu_st64, TCGV_LOW(arg), TCGV_HIGH(arg), addr, + mem_index); +#else + tcg_gen_op5i_i32(INDEX_op_qemu_st64, TCGV_LOW(arg), TCGV_HIGH(arg), + TCGV_LOW(addr), TCGV_HIGH(addr), mem_index); +#endif +} + +#define tcg_gen_ld_ptr tcg_gen_ld_i32 +#define tcg_gen_discard_ptr tcg_gen_discard_i32 + +#else /* TCG_TARGET_REG_BITS == 32 */ + +static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld8u, ret, addr, mem_index); +} + +static inline void tcg_gen_qemu_ld8s(TCGv ret, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld8s, ret, addr, mem_index); +} + +static inline void tcg_gen_qemu_ld16u(TCGv ret, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld16u, ret, addr, mem_index); +} + +static inline void tcg_gen_qemu_ld16s(TCGv ret, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld16s, ret, addr, mem_index); +} + +static inline void tcg_gen_qemu_ld32u(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32, ret, addr, mem_index); +#else + tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32u, ret, addr, mem_index); +#endif +} + +static inline void tcg_gen_qemu_ld32s(TCGv ret, TCGv addr, int mem_index) +{ +#if TARGET_LONG_BITS == 32 + 
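/*
 * Illustrative sketch (not part of the upstream sources): on a 32-bit host
 * a 64-bit TCG value travels as a (low, high) pair of i32 temps, which is
 * why the load/store wrappers above pass TCGV_LOW()/TCGV_HIGH() and an
 * extra address word.  Signed loads fill the high half with a copy of the
 * low half's sign bit (the sari by 31); unsigned loads clear it.  The
 * equivalent host-side arithmetic, with invented names:
 */
#include <stdint.h>

typedef struct { uint32_t lo, hi; } sketch_pair32;

static sketch_pair32 sketch_widen_s32(int32_t v)    /* ld32s-style widening */
{
    sketch_pair32 r;
    r.lo = (uint32_t)v;
    r.hi = v < 0 ? UINT32_MAX : 0;   /* same result as sari(lo, 31) */
    return r;
}

static sketch_pair32 sketch_widen_u32(uint32_t v)   /* ld32u-style widening */
{
    sketch_pair32 r = { v, 0 };      /* high half cleared (movi 0) */
    return r;
}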
tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32, ret, addr, mem_index); +#else + tcg_gen_qemu_ldst_op(INDEX_op_qemu_ld32s, ret, addr, mem_index); +#endif +} + +static inline void tcg_gen_qemu_ld64(TCGv_i64 ret, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op_i64(INDEX_op_qemu_ld64, ret, addr, mem_index); +} + +static inline void tcg_gen_qemu_st8(TCGv arg, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op(INDEX_op_qemu_st8, arg, addr, mem_index); +} + +static inline void tcg_gen_qemu_st16(TCGv arg, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op(INDEX_op_qemu_st16, arg, addr, mem_index); +} + +static inline void tcg_gen_qemu_st32(TCGv arg, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op(INDEX_op_qemu_st32, arg, addr, mem_index); +} + +static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index) +{ + tcg_gen_qemu_ldst_op_i64(INDEX_op_qemu_st64, arg, addr, mem_index); +} + +#define tcg_gen_ld_ptr tcg_gen_ld_i64 +#define tcg_gen_discard_ptr tcg_gen_discard_i64 + +#endif /* TCG_TARGET_REG_BITS != 32 */ + +#if TARGET_LONG_BITS == 64 +#define tcg_gen_movi_tl tcg_gen_movi_i64 +#define tcg_gen_mov_tl tcg_gen_mov_i64 +#define tcg_gen_ld8u_tl tcg_gen_ld8u_i64 +#define tcg_gen_ld8s_tl tcg_gen_ld8s_i64 +#define tcg_gen_ld16u_tl tcg_gen_ld16u_i64 +#define tcg_gen_ld16s_tl tcg_gen_ld16s_i64 +#define tcg_gen_ld32u_tl tcg_gen_ld32u_i64 +#define tcg_gen_ld32s_tl tcg_gen_ld32s_i64 +#define tcg_gen_ld_tl tcg_gen_ld_i64 +#define tcg_gen_st8_tl tcg_gen_st8_i64 +#define tcg_gen_st16_tl tcg_gen_st16_i64 +#define tcg_gen_st32_tl tcg_gen_st32_i64 +#define tcg_gen_st_tl tcg_gen_st_i64 +#define tcg_gen_add_tl tcg_gen_add_i64 +#define tcg_gen_addi_tl tcg_gen_addi_i64 +#define tcg_gen_sub_tl tcg_gen_sub_i64 +#define tcg_gen_neg_tl tcg_gen_neg_i64 +#define tcg_gen_subfi_tl tcg_gen_subfi_i64 +#define tcg_gen_subi_tl tcg_gen_subi_i64 +#define tcg_gen_and_tl tcg_gen_and_i64 +#define tcg_gen_andi_tl tcg_gen_andi_i64 +#define tcg_gen_or_tl tcg_gen_or_i64 +#define tcg_gen_ori_tl tcg_gen_ori_i64 +#define tcg_gen_xor_tl tcg_gen_xor_i64 +#define tcg_gen_xori_tl tcg_gen_xori_i64 +#define tcg_gen_not_tl tcg_gen_not_i64 +#define tcg_gen_shl_tl tcg_gen_shl_i64 +#define tcg_gen_shli_tl tcg_gen_shli_i64 +#define tcg_gen_shr_tl tcg_gen_shr_i64 +#define tcg_gen_shri_tl tcg_gen_shri_i64 +#define tcg_gen_sar_tl tcg_gen_sar_i64 +#define tcg_gen_sari_tl tcg_gen_sari_i64 +#define tcg_gen_brcond_tl tcg_gen_brcond_i64 +#define tcg_gen_brcondi_tl tcg_gen_brcondi_i64 +#define tcg_gen_setcond_tl tcg_gen_setcond_i64 +#define tcg_gen_setcondi_tl tcg_gen_setcondi_i64 +#define tcg_gen_mul_tl tcg_gen_mul_i64 +#define tcg_gen_muli_tl tcg_gen_muli_i64 +#define tcg_gen_div_tl tcg_gen_div_i64 +#define tcg_gen_rem_tl tcg_gen_rem_i64 +#define tcg_gen_divu_tl tcg_gen_divu_i64 +#define tcg_gen_remu_tl tcg_gen_remu_i64 +#define tcg_gen_discard_tl tcg_gen_discard_i64 +#define tcg_gen_trunc_tl_i32 tcg_gen_trunc_i64_i32 +#define tcg_gen_trunc_i64_tl tcg_gen_mov_i64 +#define tcg_gen_extu_i32_tl tcg_gen_extu_i32_i64 +#define tcg_gen_ext_i32_tl tcg_gen_ext_i32_i64 +#define tcg_gen_extu_tl_i64 tcg_gen_mov_i64 +#define tcg_gen_ext_tl_i64 tcg_gen_mov_i64 +#define tcg_gen_ext8u_tl tcg_gen_ext8u_i64 +#define tcg_gen_ext8s_tl tcg_gen_ext8s_i64 +#define tcg_gen_ext16u_tl tcg_gen_ext16u_i64 +#define tcg_gen_ext16s_tl tcg_gen_ext16s_i64 +#define tcg_gen_ext32u_tl tcg_gen_ext32u_i64 +#define tcg_gen_ext32s_tl tcg_gen_ext32s_i64 +#define tcg_gen_bswap16_tl tcg_gen_bswap16_i64 +#define tcg_gen_bswap32_tl tcg_gen_bswap32_i64 +#define tcg_gen_bswap64_tl 
tcg_gen_bswap64_i64 +#define tcg_gen_concat_tl_i64 tcg_gen_concat32_i64 +#define tcg_gen_andc_tl tcg_gen_andc_i64 +#define tcg_gen_eqv_tl tcg_gen_eqv_i64 +#define tcg_gen_nand_tl tcg_gen_nand_i64 +#define tcg_gen_nor_tl tcg_gen_nor_i64 +#define tcg_gen_orc_tl tcg_gen_orc_i64 +#define tcg_gen_rotl_tl tcg_gen_rotl_i64 +#define tcg_gen_rotli_tl tcg_gen_rotli_i64 +#define tcg_gen_rotr_tl tcg_gen_rotr_i64 +#define tcg_gen_rotri_tl tcg_gen_rotri_i64 +#define tcg_const_tl tcg_const_i64 +#define tcg_const_local_tl tcg_const_local_i64 +#else +#define tcg_gen_movi_tl tcg_gen_movi_i32 +#define tcg_gen_mov_tl tcg_gen_mov_i32 +#define tcg_gen_ld8u_tl tcg_gen_ld8u_i32 +#define tcg_gen_ld8s_tl tcg_gen_ld8s_i32 +#define tcg_gen_ld16u_tl tcg_gen_ld16u_i32 +#define tcg_gen_ld16s_tl tcg_gen_ld16s_i32 +#define tcg_gen_ld32u_tl tcg_gen_ld_i32 +#define tcg_gen_ld32s_tl tcg_gen_ld_i32 +#define tcg_gen_ld_tl tcg_gen_ld_i32 +#define tcg_gen_st8_tl tcg_gen_st8_i32 +#define tcg_gen_st16_tl tcg_gen_st16_i32 +#define tcg_gen_st32_tl tcg_gen_st_i32 +#define tcg_gen_st_tl tcg_gen_st_i32 +#define tcg_gen_add_tl tcg_gen_add_i32 +#define tcg_gen_addi_tl tcg_gen_addi_i32 +#define tcg_gen_sub_tl tcg_gen_sub_i32 +#define tcg_gen_neg_tl tcg_gen_neg_i32 +#define tcg_gen_subfi_tl tcg_gen_subfi_i32 +#define tcg_gen_subi_tl tcg_gen_subi_i32 +#define tcg_gen_and_tl tcg_gen_and_i32 +#define tcg_gen_andi_tl tcg_gen_andi_i32 +#define tcg_gen_or_tl tcg_gen_or_i32 +#define tcg_gen_ori_tl tcg_gen_ori_i32 +#define tcg_gen_xor_tl tcg_gen_xor_i32 +#define tcg_gen_xori_tl tcg_gen_xori_i32 +#define tcg_gen_not_tl tcg_gen_not_i32 +#define tcg_gen_shl_tl tcg_gen_shl_i32 +#define tcg_gen_shli_tl tcg_gen_shli_i32 +#define tcg_gen_shr_tl tcg_gen_shr_i32 +#define tcg_gen_shri_tl tcg_gen_shri_i32 +#define tcg_gen_sar_tl tcg_gen_sar_i32 +#define tcg_gen_sari_tl tcg_gen_sari_i32 +#define tcg_gen_brcond_tl tcg_gen_brcond_i32 +#define tcg_gen_brcondi_tl tcg_gen_brcondi_i32 +#define tcg_gen_setcond_tl tcg_gen_setcond_i32 +#define tcg_gen_setcondi_tl tcg_gen_setcondi_i32 +#define tcg_gen_mul_tl tcg_gen_mul_i32 +#define tcg_gen_muli_tl tcg_gen_muli_i32 +#define tcg_gen_div_tl tcg_gen_div_i32 +#define tcg_gen_rem_tl tcg_gen_rem_i32 +#define tcg_gen_divu_tl tcg_gen_divu_i32 +#define tcg_gen_remu_tl tcg_gen_remu_i32 +#define tcg_gen_discard_tl tcg_gen_discard_i32 +#define tcg_gen_trunc_tl_i32 tcg_gen_mov_i32 +#define tcg_gen_trunc_i64_tl tcg_gen_trunc_i64_i32 +#define tcg_gen_extu_i32_tl tcg_gen_mov_i32 +#define tcg_gen_ext_i32_tl tcg_gen_mov_i32 +#define tcg_gen_extu_tl_i64 tcg_gen_extu_i32_i64 +#define tcg_gen_ext_tl_i64 tcg_gen_ext_i32_i64 +#define tcg_gen_ext8u_tl tcg_gen_ext8u_i32 +#define tcg_gen_ext8s_tl tcg_gen_ext8s_i32 +#define tcg_gen_ext16u_tl tcg_gen_ext16u_i32 +#define tcg_gen_ext16s_tl tcg_gen_ext16s_i32 +#define tcg_gen_ext32u_tl tcg_gen_mov_i32 +#define tcg_gen_ext32s_tl tcg_gen_mov_i32 +#define tcg_gen_bswap16_tl tcg_gen_bswap16_i32 +#define tcg_gen_bswap32_tl tcg_gen_bswap32_i32 +#define tcg_gen_concat_tl_i64 tcg_gen_concat_i32_i64 +#define tcg_gen_andc_tl tcg_gen_andc_i32 +#define tcg_gen_eqv_tl tcg_gen_eqv_i32 +#define tcg_gen_nand_tl tcg_gen_nand_i32 +#define tcg_gen_nor_tl tcg_gen_nor_i32 +#define tcg_gen_orc_tl tcg_gen_orc_i32 +#define tcg_gen_rotl_tl tcg_gen_rotl_i32 +#define tcg_gen_rotli_tl tcg_gen_rotli_i32 +#define tcg_gen_rotr_tl tcg_gen_rotr_i32 +#define tcg_gen_rotri_tl tcg_gen_rotri_i32 +#define tcg_const_tl tcg_const_i32 +#define tcg_const_local_tl tcg_const_local_i32 +#endif + +#if TCG_TARGET_REG_BITS == 32 +#define 
tcg_gen_add_ptr tcg_gen_add_i32 +#define tcg_gen_addi_ptr tcg_gen_addi_i32 +#define tcg_gen_ext_i32_ptr tcg_gen_mov_i32 +#else /* TCG_TARGET_REG_BITS == 32 */ +#define tcg_gen_add_ptr tcg_gen_add_i64 +#define tcg_gen_addi_ptr tcg_gen_addi_i64 +#define tcg_gen_ext_i32_ptr tcg_gen_ext_i32_i64 +#endif /* TCG_TARGET_REG_BITS != 32 */ diff --git a/src/recompiler/tcg/tcg-opc.h b/src/recompiler/tcg/tcg-opc.h new file mode 100644 index 00000000..2a98fed9 --- /dev/null +++ b/src/recompiler/tcg/tcg-opc.h @@ -0,0 +1,304 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * DEF(name, oargs, iargs, cargs, flags) + */ + +/* predefined ops */ +DEF(end, 0, 0, 0, 0) /* must be kept first */ +DEF(nop, 0, 0, 0, 0) +DEF(nop1, 0, 0, 1, 0) +DEF(nop2, 0, 0, 2, 0) +DEF(nop3, 0, 0, 3, 0) +DEF(nopn, 0, 0, 1, 0) /* variable number of parameters */ + +DEF(discard, 1, 0, 0, 0) + +DEF(set_label, 0, 0, 1, 0) +DEF(call, 0, 1, 2, TCG_OPF_SIDE_EFFECTS) /* variable number of parameters */ +DEF(jmp, 0, 1, 0, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) +DEF(br, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) + +DEF(mov_i32, 1, 1, 0, 0) +DEF(movi_i32, 1, 0, 1, 0) +DEF(setcond_i32, 1, 2, 1, 0) +/* load/store */ +DEF(ld8u_i32, 1, 1, 1, 0) +DEF(ld8s_i32, 1, 1, 1, 0) +DEF(ld16u_i32, 1, 1, 1, 0) +DEF(ld16s_i32, 1, 1, 1, 0) +DEF(ld_i32, 1, 1, 1, 0) +DEF(st8_i32, 0, 2, 1, TCG_OPF_SIDE_EFFECTS) +DEF(st16_i32, 0, 2, 1, TCG_OPF_SIDE_EFFECTS) +DEF(st_i32, 0, 2, 1, TCG_OPF_SIDE_EFFECTS) +/* arith */ +DEF(add_i32, 1, 2, 0, 0) +DEF(sub_i32, 1, 2, 0, 0) +DEF(mul_i32, 1, 2, 0, 0) +#ifdef TCG_TARGET_HAS_div_i32 +DEF(div_i32, 1, 2, 0, 0) +DEF(divu_i32, 1, 2, 0, 0) +DEF(rem_i32, 1, 2, 0, 0) +DEF(remu_i32, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_div2_i32 +DEF(div2_i32, 2, 3, 0, 0) +DEF(divu2_i32, 2, 3, 0, 0) +#endif +DEF(and_i32, 1, 2, 0, 0) +DEF(or_i32, 1, 2, 0, 0) +DEF(xor_i32, 1, 2, 0, 0) +/* shifts/rotates */ +DEF(shl_i32, 1, 2, 0, 0) +DEF(shr_i32, 1, 2, 0, 0) +DEF(sar_i32, 1, 2, 0, 0) +#ifdef TCG_TARGET_HAS_rot_i32 +DEF(rotl_i32, 1, 2, 0, 0) +DEF(rotr_i32, 1, 2, 0, 0) +#endif + +DEF(brcond_i32, 0, 2, 2, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) +#if TCG_TARGET_REG_BITS == 32 +DEF(add2_i32, 2, 4, 0, 0) +DEF(sub2_i32, 2, 4, 0, 0) +DEF(brcond2_i32, 0, 4, 2, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) +DEF(mulu2_i32, 2, 2, 0, 0) +DEF(setcond2_i32, 1, 4, 1, 0) +#endif +#ifdef TCG_TARGET_HAS_ext8s_i32 +DEF(ext8s_i32, 1, 1, 0, 0) +#endif 
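/*
 * Illustrative sketch (not part of the upstream sources): tcg-opc.h is an
 * X-macro list -- every opcode is one DEF(name, oargs, iargs, cargs, flags)
 * line, and each includer supplies its own DEF before including the file.
 * tcg.c (later in this patch) builds its TCGOpDef table that way, and the
 * INDEX_op_* enumeration is produced from the same list in tcg.h (not shown
 * here).  A minimal self-contained model of the pattern; the real list lives
 * in a re-included header, here it is a macro so the sketch fits in one file:
 */
#include <stdio.h>

#define SKETCH_OPS \
    DEF(nop,    0, 0) \
    DEF(add,    1, 2) \
    DEF(brcond, 0, 2)

enum {                                    /* expansion #1: the opcode enum */
#define DEF(name, oargs, iargs) SKETCH_op_##name,
    SKETCH_OPS
#undef DEF
    SKETCH_NB_OPS
};

static const struct { const char *name; int oargs, iargs; } sketch_op_defs[] = {
#define DEF(name, oargs, iargs) { #name, oargs, iargs },  /* expansion #2: the table */
    SKETCH_OPS
#undef DEF
};

int main(void)
{
    /* prints "add: 2 in, 1 out" */
    printf("%s: %d in, %d out\n", sketch_op_defs[SKETCH_op_add].name,
           sketch_op_defs[SKETCH_op_add].iargs,
           sketch_op_defs[SKETCH_op_add].oargs);
    return 0;
}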
+#ifdef TCG_TARGET_HAS_ext16s_i32 +DEF(ext16s_i32, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_ext8u_i32 +DEF(ext8u_i32, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_ext16u_i32 +DEF(ext16u_i32, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_bswap16_i32 +DEF(bswap16_i32, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_bswap32_i32 +DEF(bswap32_i32, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_not_i32 +DEF(not_i32, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_neg_i32 +DEF(neg_i32, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_andc_i32 +DEF(andc_i32, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_orc_i32 +DEF(orc_i32, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_eqv_i32 +DEF(eqv_i32, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_nand_i32 +DEF(nand_i32, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_nor_i32 +DEF(nor_i32, 1, 2, 0, 0) +#endif + +#if TCG_TARGET_REG_BITS == 64 +DEF(mov_i64, 1, 1, 0, 0) +DEF(movi_i64, 1, 0, 1, 0) +DEF(setcond_i64, 1, 2, 1, 0) +/* load/store */ +DEF(ld8u_i64, 1, 1, 1, 0) +DEF(ld8s_i64, 1, 1, 1, 0) +DEF(ld16u_i64, 1, 1, 1, 0) +DEF(ld16s_i64, 1, 1, 1, 0) +DEF(ld32u_i64, 1, 1, 1, 0) +DEF(ld32s_i64, 1, 1, 1, 0) +DEF(ld_i64, 1, 1, 1, 0) +DEF(st8_i64, 0, 2, 1, TCG_OPF_SIDE_EFFECTS) +DEF(st16_i64, 0, 2, 1, TCG_OPF_SIDE_EFFECTS) +DEF(st32_i64, 0, 2, 1, TCG_OPF_SIDE_EFFECTS) +DEF(st_i64, 0, 2, 1, TCG_OPF_SIDE_EFFECTS) +/* arith */ +DEF(add_i64, 1, 2, 0, 0) +DEF(sub_i64, 1, 2, 0, 0) +DEF(mul_i64, 1, 2, 0, 0) +#ifdef TCG_TARGET_HAS_div_i64 +DEF(div_i64, 1, 2, 0, 0) +DEF(divu_i64, 1, 2, 0, 0) +DEF(rem_i64, 1, 2, 0, 0) +DEF(remu_i64, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_div2_i64 +DEF(div2_i64, 2, 3, 0, 0) +DEF(divu2_i64, 2, 3, 0, 0) +#endif +DEF(and_i64, 1, 2, 0, 0) +DEF(or_i64, 1, 2, 0, 0) +DEF(xor_i64, 1, 2, 0, 0) +/* shifts/rotates */ +DEF(shl_i64, 1, 2, 0, 0) +DEF(shr_i64, 1, 2, 0, 0) +DEF(sar_i64, 1, 2, 0, 0) +#ifdef TCG_TARGET_HAS_rot_i64 +DEF(rotl_i64, 1, 2, 0, 0) +DEF(rotr_i64, 1, 2, 0, 0) +#endif + +DEF(brcond_i64, 0, 2, 2, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) +#ifdef TCG_TARGET_HAS_ext8s_i64 +DEF(ext8s_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_ext16s_i64 +DEF(ext16s_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_ext32s_i64 +DEF(ext32s_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_ext8u_i64 +DEF(ext8u_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_ext16u_i64 +DEF(ext16u_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_ext32u_i64 +DEF(ext32u_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_bswap16_i64 +DEF(bswap16_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_bswap32_i64 +DEF(bswap32_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_bswap64_i64 +DEF(bswap64_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_not_i64 +DEF(not_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_neg_i64 +DEF(neg_i64, 1, 1, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_andc_i64 +DEF(andc_i64, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_orc_i64 +DEF(orc_i64, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_eqv_i64 +DEF(eqv_i64, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_nand_i64 +DEF(nand_i64, 1, 2, 0, 0) +#endif +#ifdef TCG_TARGET_HAS_nor_i64 +DEF(nor_i64, 1, 2, 0, 0) +#endif +#endif + +/* QEMU specific */ +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS +DEF(debug_insn_start, 0, 0, 2, 0) +#else +DEF(debug_insn_start, 0, 0, 1, 0) +#endif +DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) +DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) +/* Note: even if TARGET_LONG_BITS is not defined, the INDEX_op + constants must be defined */ +#if TCG_TARGET_REG_BITS == 32 +#if TARGET_LONG_BITS == 32 +DEF(qemu_ld8u, 1, 1, 1, 
TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_ld8u, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_ld8s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_ld8s, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_ld16u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_ld16u, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_ld16s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_ld16s, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_ld32, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_ld32, 1, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_ld64, 2, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_ld64, 2, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif + +#if TARGET_LONG_BITS == 32 +DEF(qemu_st8, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_st8, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_st16, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_st16, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_st32, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_st32, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif +#if TARGET_LONG_BITS == 32 +DEF(qemu_st64, 0, 3, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#else +DEF(qemu_st64, 0, 4, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +#endif + +#else /* TCG_TARGET_REG_BITS == 32 */ + +DEF(qemu_ld8u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld8s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld16u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld16s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld32, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld32u, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld32s, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_ld64, 1, 1, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) + +DEF(qemu_st8, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_st16, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_st32, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) +DEF(qemu_st64, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) + +#endif /* TCG_TARGET_REG_BITS != 32 */ + +#undef DEF diff --git a/src/recompiler/tcg/tcg-runtime.h b/src/recompiler/tcg/tcg-runtime.h new file mode 100644 index 00000000..5615b133 --- /dev/null +++ b/src/recompiler/tcg/tcg-runtime.h @@ -0,0 +1,18 @@ +#ifndef TCG_RUNTIME_H +#define TCG_RUNTIME_H + +/* tcg-runtime.c */ +int32_t tcg_helper_div_i32(int32_t arg1, int32_t arg2); +int32_t tcg_helper_rem_i32(int32_t arg1, int32_t arg2); +uint32_t tcg_helper_divu_i32(uint32_t arg1, uint32_t arg2); +uint32_t tcg_helper_remu_i32(uint32_t arg1, uint32_t arg2); + +int64_t tcg_helper_shl_i64(int64_t arg1, int64_t arg2); +int64_t tcg_helper_shr_i64(int64_t arg1, int64_t arg2); +int64_t tcg_helper_sar_i64(int64_t arg1, int64_t arg2); +int64_t tcg_helper_div_i64(int64_t arg1, int64_t arg2); +int64_t tcg_helper_rem_i64(int64_t arg1, int64_t arg2); +uint64_t tcg_helper_divu_i64(uint64_t 
arg1, uint64_t arg2); +uint64_t tcg_helper_remu_i64(uint64_t arg1, uint64_t arg2); + +#endif diff --git a/src/recompiler/tcg/tcg.c b/src/recompiler/tcg/tcg.c new file mode 100644 index 00000000..4e636099 --- /dev/null +++ b/src/recompiler/tcg/tcg.c @@ -0,0 +1,2212 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* define it to use liveness analysis (better code) */ +#define USE_LIVENESS_ANALYSIS + +#include "config.h" + +#if !defined(CONFIG_DEBUG_TCG) && !defined(NDEBUG) +/* define it to suppress various consistency checks (faster) */ +#define NDEBUG +#endif + +#ifndef VBOX +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#ifdef _WIN32 +#include <malloc.h> +#endif +#ifdef _AIX +#include <alloca.h> +#endif +#else /* VBOX */ +# include <stdio.h> +# include "osdep.h" +#endif /* VBOX */ + +#include "qemu-common.h" +#include "cache-utils.h" +#include "host-utils.h" +#include "qemu-timer.h" + +/* Note: the long term plan is to reduce the dependancies on the QEMU + CPU definitions. Currently they are used for qemu_ld/st + instructions */ +#define NO_CPU_IO_DEFS +#include "cpu.h" +#include "exec-all.h" + +#include "tcg-op.h" +#include "elf.h" + +#if defined(CONFIG_USE_GUEST_BASE) && !defined(TCG_TARGET_HAS_GUEST_BASE) +#error GUEST_BASE not supported on this host. +#endif + +#ifdef VBOX +/* + * Liveness analysis doesn't work well with 32-bit hosts and 64-bit targets, + * second element of the register pair to store 64-bit value is considered + * dead, it seems. 
*/ + /** @todo re-test this */ +# if defined(TARGET_X86_64) && (TCG_TARGET_REG_BITS == 32) +# undef USE_LIVENESS_ANALYSIS +# endif +#endif /* VBOX */ + +static void tcg_target_init(TCGContext *s); +static void tcg_target_qemu_prologue(TCGContext *s); +static void patch_reloc(uint8_t *code_ptr, int type, + tcg_target_long value, tcg_target_long addend); + +static TCGOpDef tcg_op_defs[] = { +#define DEF(s, oargs, iargs, cargs, flags) { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags }, +#include "tcg-opc.h" +#undef DEF +}; + +static TCGRegSet tcg_target_available_regs[2]; +static TCGRegSet tcg_target_call_clobber_regs; + +/* XXX: move that inside the context */ +uint16_t *gen_opc_ptr; +TCGArg *gen_opparam_ptr; + +static inline void tcg_out8(TCGContext *s, uint8_t v) +{ + *s->code_ptr++ = v; +} + +static inline void tcg_out16(TCGContext *s, uint16_t v) +{ + *(uint16_t *)s->code_ptr = v; + s->code_ptr += 2; +} + +static inline void tcg_out32(TCGContext *s, uint32_t v) +{ + *(uint32_t *)s->code_ptr = v; + s->code_ptr += 4; +} + +/* label relocation processing */ + +static void tcg_out_reloc(TCGContext *s, uint8_t *code_ptr, int type, + int label_index, tcg_target_long addend) +{ + TCGLabel *l; + TCGRelocation *r; + + l = &s->labels[label_index]; + if (l->has_value) { + /* FIXME: This may break relocations on RISC targets that + modify instruction fields in place. The caller may not have + written the initial value. */ + patch_reloc(code_ptr, type, l->u.value, addend); + } else { + /* add a new relocation entry */ + r = tcg_malloc(sizeof(TCGRelocation)); + r->type = type; + r->ptr = code_ptr; + r->addend = addend; + r->next = l->u.first_reloc; + l->u.first_reloc = r; + } +} + +static void tcg_out_label(TCGContext *s, int label_index, + tcg_target_long value) +{ + TCGLabel *l; + TCGRelocation *r; + + l = &s->labels[label_index]; + if (l->has_value) + tcg_abort(); + r = l->u.first_reloc; + while (r != NULL) { + patch_reloc(r->ptr, r->type, value, r->addend); + r = r->next; + } + l->has_value = 1; + l->u.value = value; +} + +int gen_new_label(void) +{ + TCGContext *s = &tcg_ctx; + int idx; + TCGLabel *l; + + if (s->nb_labels >= TCG_MAX_LABELS) + tcg_abort(); + idx = s->nb_labels++; + l = &s->labels[idx]; + l->has_value = 0; + l->u.first_reloc = NULL; + return idx; +} + +#include "tcg-target.c" + +/* pool based memory allocation */ +void *tcg_malloc_internal(TCGContext *s, int size) +{ + TCGPool *p; + int pool_size; + + if (size > TCG_POOL_CHUNK_SIZE) { + /* big malloc: insert a new pool (XXX: could optimize) */ + p = qemu_malloc(sizeof(TCGPool) + size); + p->size = size; + if (s->pool_current) + s->pool_current->next = p; + else + s->pool_first = p; + p->next = s->pool_current; + } else { + p = s->pool_current; + if (!p) { + p = s->pool_first; + if (!p) + goto new_pool; + } else { + if (!p->next) { + new_pool: + pool_size = TCG_POOL_CHUNK_SIZE; + p = qemu_malloc(sizeof(TCGPool) + pool_size); + p->size = pool_size; + p->next = NULL; + if (s->pool_current) + s->pool_current->next = p; + else + s->pool_first = p; + } else { + p = p->next; + } + } + } + s->pool_current = p; + s->pool_cur = p->data + size; + s->pool_end = p->data + p->size; + return p->data; +} + +void tcg_pool_reset(TCGContext *s) +{ + s->pool_cur = s->pool_end = NULL; + s->pool_current = NULL; +} + +void tcg_context_init(TCGContext *s) +{ + int op, total_args, n; + TCGOpDef *def; + TCGArgConstraint *args_ct; + int *sorted_args; + + memset(s, 0, sizeof(*s)); + s->temps = s->static_temps; + s->nb_globals = 0; + + /* Count total 
number of arguments and allocate the corresponding + space */ + total_args = 0; + for(op = 0; op < NB_OPS; op++) { + def = &tcg_op_defs[op]; + n = def->nb_iargs + def->nb_oargs; + total_args += n; + } + + args_ct = qemu_malloc(sizeof(TCGArgConstraint) * total_args); + sorted_args = qemu_malloc(sizeof(int) * total_args); + + for(op = 0; op < NB_OPS; op++) { + def = &tcg_op_defs[op]; + def->args_ct = args_ct; + def->sorted_args = sorted_args; + n = def->nb_iargs + def->nb_oargs; + sorted_args += n; + args_ct += n; + } + + tcg_target_init(s); +} + +void tcg_prologue_init(TCGContext *s) +{ + /* init global prologue and epilogue */ + s->code_buf = code_gen_prologue; + s->code_ptr = s->code_buf; + tcg_target_qemu_prologue(s); + flush_icache_range((uintptr_t)s->code_buf, + (uintptr_t)s->code_ptr); +} + +void tcg_set_frame(TCGContext *s, int reg, + tcg_target_long start, tcg_target_long size) +{ + s->frame_start = start; + s->frame_end = start + size; + s->frame_reg = reg; +} + +void tcg_func_start(TCGContext *s) +{ + int i; + tcg_pool_reset(s); + s->nb_temps = s->nb_globals; + for(i = 0; i < (TCG_TYPE_COUNT * 2); i++) + s->first_free_temp[i] = -1; + s->labels = tcg_malloc(sizeof(TCGLabel) * TCG_MAX_LABELS); + s->nb_labels = 0; + s->current_frame_offset = s->frame_start; + + gen_opc_ptr = gen_opc_buf; + gen_opparam_ptr = gen_opparam_buf; +} + +static inline void tcg_temp_alloc(TCGContext *s, int n) +{ + if (n > TCG_MAX_TEMPS) + tcg_abort(); +} + +static inline int tcg_global_reg_new_internal(TCGType type, int reg, + const char *name) +{ + TCGContext *s = &tcg_ctx; + TCGTemp *ts; + int idx; + +#if TCG_TARGET_REG_BITS == 32 + if (type != TCG_TYPE_I32) + tcg_abort(); +#endif + if (tcg_regset_test_reg(s->reserved_regs, reg)) + tcg_abort(); + idx = s->nb_globals; + tcg_temp_alloc(s, s->nb_globals + 1); + ts = &s->temps[s->nb_globals]; + ts->base_type = type; + ts->type = type; + ts->fixed_reg = 1; + ts->reg = reg; + ts->name = name; + s->nb_globals++; + tcg_regset_set_reg(s->reserved_regs, reg); + return idx; +} + +TCGv_i32 tcg_global_reg_new_i32(int reg, const char *name) +{ + int idx; + + idx = tcg_global_reg_new_internal(TCG_TYPE_I32, reg, name); + return MAKE_TCGV_I32(idx); +} + +TCGv_i64 tcg_global_reg_new_i64(int reg, const char *name) +{ + int idx; + + idx = tcg_global_reg_new_internal(TCG_TYPE_I64, reg, name); + return MAKE_TCGV_I64(idx); +} + +static inline int tcg_global_mem_new_internal(TCGType type, int reg, + tcg_target_long offset, + const char *name) +{ + TCGContext *s = &tcg_ctx; + TCGTemp *ts; + int idx; + + idx = s->nb_globals; +#if TCG_TARGET_REG_BITS == 32 + if (type == TCG_TYPE_I64) { + char buf[64]; + tcg_temp_alloc(s, s->nb_globals + 2); + ts = &s->temps[s->nb_globals]; + ts->base_type = type; + ts->type = TCG_TYPE_I32; + ts->fixed_reg = 0; + ts->mem_allocated = 1; + ts->mem_reg = reg; +#ifdef TCG_TARGET_WORDS_BIGENDIAN + ts->mem_offset = offset + 4; +#else + ts->mem_offset = offset; +#endif + pstrcpy(buf, sizeof(buf), name); + pstrcat(buf, sizeof(buf), "_0"); + ts->name = strdup(buf); + ts++; + + ts->base_type = type; + ts->type = TCG_TYPE_I32; + ts->fixed_reg = 0; + ts->mem_allocated = 1; + ts->mem_reg = reg; +#ifdef TCG_TARGET_WORDS_BIGENDIAN + ts->mem_offset = offset; +#else + ts->mem_offset = offset + 4; +#endif + pstrcpy(buf, sizeof(buf), name); + pstrcat(buf, sizeof(buf), "_1"); + ts->name = strdup(buf); + + s->nb_globals += 2; + } else +#endif + { + tcg_temp_alloc(s, s->nb_globals + 1); + ts = &s->temps[s->nb_globals]; + ts->base_type = type; + ts->type = type; + 
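/*
 * Illustrative sketch (not part of the upstream sources): on a 32-bit host
 * the code above registers a 64-bit global as two i32 temps, "<name>_0" and
 * "<name>_1", covering the two words of the same 64-bit memory slot.  The
 * "+ 4" is applied so that each temp's mem_offset points at the correct
 * word for the host byte order, which reduces to (names invented):
 */
#include <stdint.h>

static void sketch_half_offsets(intptr_t off64, int host_big_endian,
                                intptr_t *lo_word_off, intptr_t *hi_word_off)
{
    /* little-endian host: low word at off64; big-endian host: at off64 + 4 */
    *lo_word_off = off64 + (host_big_endian ? 4 : 0);
    *hi_word_off = off64 + (host_big_endian ? 0 : 4);
}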
ts->fixed_reg = 0; + ts->mem_allocated = 1; + ts->mem_reg = reg; + ts->mem_offset = offset; + ts->name = name; + s->nb_globals++; + } + return idx; +} + +TCGv_i32 tcg_global_mem_new_i32(int reg, tcg_target_long offset, + const char *name) +{ + int idx; + + idx = tcg_global_mem_new_internal(TCG_TYPE_I32, reg, offset, name); + return MAKE_TCGV_I32(idx); +} + +TCGv_i64 tcg_global_mem_new_i64(int reg, tcg_target_long offset, + const char *name) +{ + int idx; + + idx = tcg_global_mem_new_internal(TCG_TYPE_I64, reg, offset, name); + return MAKE_TCGV_I64(idx); +} + +static inline int tcg_temp_new_internal(TCGType type, int temp_local) +{ + TCGContext *s = &tcg_ctx; + TCGTemp *ts; + int idx, k; + + k = type; + if (temp_local) + k += TCG_TYPE_COUNT; + idx = s->first_free_temp[k]; + if (idx != -1) { + /* There is already an available temp with the + right type */ + ts = &s->temps[idx]; + s->first_free_temp[k] = ts->next_free_temp; + ts->temp_allocated = 1; + assert(ts->temp_local == temp_local); + } else { + idx = s->nb_temps; +#if TCG_TARGET_REG_BITS == 32 + if (type == TCG_TYPE_I64) { + tcg_temp_alloc(s, s->nb_temps + 2); + ts = &s->temps[s->nb_temps]; + ts->base_type = type; + ts->type = TCG_TYPE_I32; + ts->temp_allocated = 1; + ts->temp_local = temp_local; + ts->name = NULL; + ts++; + ts->base_type = TCG_TYPE_I32; + ts->type = TCG_TYPE_I32; + ts->temp_allocated = 1; + ts->temp_local = temp_local; + ts->name = NULL; + s->nb_temps += 2; + } else +#endif + { + tcg_temp_alloc(s, s->nb_temps + 1); + ts = &s->temps[s->nb_temps]; + ts->base_type = type; + ts->type = type; + ts->temp_allocated = 1; + ts->temp_local = temp_local; + ts->name = NULL; + s->nb_temps++; + } + } + return idx; +} + +TCGv_i32 tcg_temp_new_internal_i32(int temp_local) +{ + int idx; + + idx = tcg_temp_new_internal(TCG_TYPE_I32, temp_local); + return MAKE_TCGV_I32(idx); +} + +TCGv_i64 tcg_temp_new_internal_i64(int temp_local) +{ + int idx; + + idx = tcg_temp_new_internal(TCG_TYPE_I64, temp_local); + return MAKE_TCGV_I64(idx); +} + +static inline void tcg_temp_free_internal(int idx) +{ + TCGContext *s = &tcg_ctx; + TCGTemp *ts; + int k; + + assert(idx >= s->nb_globals && idx < s->nb_temps); + ts = &s->temps[idx]; + assert(ts->temp_allocated != 0); + ts->temp_allocated = 0; + k = ts->base_type; + if (ts->temp_local) + k += TCG_TYPE_COUNT; + ts->next_free_temp = s->first_free_temp[k]; + s->first_free_temp[k] = idx; +} + +void tcg_temp_free_i32(TCGv_i32 arg) +{ + tcg_temp_free_internal(GET_TCGV_I32(arg)); +} + +void tcg_temp_free_i64(TCGv_i64 arg) +{ + tcg_temp_free_internal(GET_TCGV_I64(arg)); +} + +TCGv_i32 tcg_const_i32(int32_t val) +{ + TCGv_i32 t0; + t0 = tcg_temp_new_i32(); + tcg_gen_movi_i32(t0, val); + return t0; +} + +TCGv_i64 tcg_const_i64(int64_t val) +{ + TCGv_i64 t0; + t0 = tcg_temp_new_i64(); + tcg_gen_movi_i64(t0, val); + return t0; +} + +TCGv_i32 tcg_const_local_i32(int32_t val) +{ + TCGv_i32 t0; + t0 = tcg_temp_local_new_i32(); + tcg_gen_movi_i32(t0, val); + return t0; +} + +TCGv_i64 tcg_const_local_i64(int64_t val) +{ + TCGv_i64 t0; + t0 = tcg_temp_local_new_i64(); + tcg_gen_movi_i64(t0, val); + return t0; +} + +void tcg_register_helper(void *func, const char *name) +{ + TCGContext *s = &tcg_ctx; + int n; + if ((s->nb_helpers + 1) > s->allocated_helpers) { + n = s->allocated_helpers; + if (n == 0) { + n = 4; + } else { + n *= 2; + } +#ifdef VBOX + s->helpers = qemu_realloc(s->helpers, n * sizeof(TCGHelperInfo)); +#else + s->helpers = realloc(s->helpers, n * sizeof(TCGHelperInfo)); +#endif + s->allocated_helpers = n; + 
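/*
 * Illustrative sketch (not part of the upstream sources): tcg_gen_callN()
 * below receives a packed "sizemask" describing the call signature, two
 * bits per value -- bit 0 marks a 64-bit return, and for argument i the bit
 * at position (i + 1) * 2 marks a 64-bit argument while the next bit marks
 * a signed one (used to choose sign- vs zero-extension on hosts that
 * extend arguments).  The tests it performs are equivalent to (names
 * invented):
 */
static int sketch_ret_is_64bit(int sizemask)
{
    return sizemask & 1;
}

static int sketch_arg_is_64bit(int sizemask, int i)
{
    return (sizemask & (1 << (i + 1) * 2)) != 0;
}

static int sketch_arg_is_signed(int sizemask, int i)
{
    return (sizemask & (2 << (i + 1) * 2)) != 0;
}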
} + s->helpers[s->nb_helpers].func = (tcg_target_ulong)func; + s->helpers[s->nb_helpers].name = name; + s->nb_helpers++; +} + +/* Note: we convert the 64 bit args to 32 bit and do some alignment + and endian swap. Maybe it would be better to do the alignment + and endian swap in tcg_reg_alloc_call(). */ +void tcg_gen_callN(TCGContext *s, TCGv_ptr func, unsigned int flags, + int sizemask, TCGArg ret, int nargs, TCGArg *args) +{ +#ifdef TCG_TARGET_I386 + int call_type; +#endif + int i; + int real_args; + int nb_rets; + TCGArg *nparam; + +#if defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64 + for (i = 0; i < nargs; ++i) { + int is_64bit = sizemask & (1 << (i+1)*2); + int is_signed = sizemask & (2 << (i+1)*2); + if (!is_64bit) { + TCGv_i64 temp = tcg_temp_new_i64(); + TCGv_i64 orig = MAKE_TCGV_I64(args[i]); + if (is_signed) { + tcg_gen_ext32s_i64(temp, orig); + } else { + tcg_gen_ext32u_i64(temp, orig); + } + args[i] = GET_TCGV_I64(temp); + } + } +#endif /* TCG_TARGET_EXTEND_ARGS */ + + *gen_opc_ptr++ = INDEX_op_call; + nparam = gen_opparam_ptr++; +#ifdef TCG_TARGET_I386 + call_type = (flags & TCG_CALL_TYPE_MASK); +#endif + if (ret != TCG_CALL_DUMMY_ARG) { +#if TCG_TARGET_REG_BITS < 64 + if (sizemask & 1) { +#ifdef TCG_TARGET_WORDS_BIGENDIAN + *gen_opparam_ptr++ = ret + 1; + *gen_opparam_ptr++ = ret; +#else + *gen_opparam_ptr++ = ret; + *gen_opparam_ptr++ = ret + 1; +#endif + nb_rets = 2; + } else +#endif + { + *gen_opparam_ptr++ = ret; + nb_rets = 1; + } + } else { + nb_rets = 0; + } + real_args = 0; + for (i = 0; i < nargs; i++) { +#if TCG_TARGET_REG_BITS < 64 + int is_64bit = sizemask & (1 << (i+1)*2); + if (is_64bit) { +#ifdef TCG_TARGET_I386 + /* REGPARM case: if the third parameter is 64 bit, it is + allocated on the stack */ + if (i == 2 && call_type == TCG_CALL_TYPE_REGPARM) { + call_type = TCG_CALL_TYPE_REGPARM_2; + flags = (flags & ~TCG_CALL_TYPE_MASK) | call_type; + } +#endif +#ifdef TCG_TARGET_CALL_ALIGN_ARGS + /* some targets want aligned 64 bit args */ + if (real_args & 1) { + *gen_opparam_ptr++ = TCG_CALL_DUMMY_ARG; + real_args++; + } +#endif + /* If stack grows up, then we will be placing successive + arguments at lower addresses, which means we need to + reverse the order compared to how we would normally + treat either big or little-endian. For those arguments + that will wind up in registers, this still works for + HPPA (the only current STACK_GROWSUP target) since the + argument registers are *also* allocated in decreasing + order. If another such target is added, this logic may + have to get more complicated to differentiate between + stack arguments and register arguments. 
*/ +#if defined(TCG_TARGET_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP) + *gen_opparam_ptr++ = args[i] + 1; + *gen_opparam_ptr++ = args[i]; +#else + *gen_opparam_ptr++ = args[i]; + *gen_opparam_ptr++ = args[i] + 1; +#endif + real_args += 2; + continue; + } +#endif /* TCG_TARGET_REG_BITS < 64 */ + + *gen_opparam_ptr++ = args[i]; + real_args++; + } + *gen_opparam_ptr++ = GET_TCGV_PTR(func); + + *gen_opparam_ptr++ = flags; + + *nparam = (nb_rets << 16) | (real_args + 1); + + /* total parameters, needed to go backward in the instruction stream */ + *gen_opparam_ptr++ = 1 + nb_rets + real_args + 3; + +#if defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64 + for (i = 0; i < nargs; ++i) { + int is_64bit = sizemask & (1 << (i+1)*2); + if (!is_64bit) { + TCGv_i64 temp = MAKE_TCGV_I64(args[i]); + tcg_temp_free_i64(temp); + } + } +#endif /* TCG_TARGET_EXTEND_ARGS */ +} + +#if TCG_TARGET_REG_BITS == 32 +void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1, + int c, int right, int arith) +{ + if (c == 0) { + tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1)); + tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1)); + } else if (c >= 32) { + c -= 32; + if (right) { + if (arith) { + tcg_gen_sari_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c); + tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), 31); + } else { + tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c); + tcg_gen_movi_i32(TCGV_HIGH(ret), 0); + } + } else { + tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_LOW(arg1), c); + tcg_gen_movi_i32(TCGV_LOW(ret), 0); + } + } else { + TCGv_i32 t0, t1; + + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + if (right) { + tcg_gen_shli_i32(t0, TCGV_HIGH(arg1), 32 - c); + if (arith) + tcg_gen_sari_i32(t1, TCGV_HIGH(arg1), c); + else + tcg_gen_shri_i32(t1, TCGV_HIGH(arg1), c); + tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_LOW(arg1), c); + tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t0); + tcg_gen_mov_i32(TCGV_HIGH(ret), t1); + } else { + tcg_gen_shri_i32(t0, TCGV_LOW(arg1), 32 - c); + /* Note: ret can be the same as arg1, so we use t1 */ + tcg_gen_shli_i32(t1, TCGV_LOW(arg1), c); + tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), c); + tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t0); + tcg_gen_mov_i32(TCGV_LOW(ret), t1); + } + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); + } +} +#endif + + +static void tcg_reg_alloc_start(TCGContext *s) +{ + int i; + TCGTemp *ts; + for(i = 0; i < s->nb_globals; i++) { + ts = &s->temps[i]; + if (ts->fixed_reg) { + ts->val_type = TEMP_VAL_REG; + } else { + ts->val_type = TEMP_VAL_MEM; + } + } + for(i = s->nb_globals; i < s->nb_temps; i++) { + ts = &s->temps[i]; + ts->val_type = TEMP_VAL_DEAD; + ts->mem_allocated = 0; + ts->fixed_reg = 0; + } + for(i = 0; i < TCG_TARGET_NB_REGS; i++) { + s->reg_to_temp[i] = -1; + } +} + +static char *tcg_get_arg_str_idx(TCGContext *s, char *buf, int buf_size, + int idx) +{ + TCGTemp *ts; + + ts = &s->temps[idx]; + if (idx < s->nb_globals) { + pstrcpy(buf, buf_size, ts->name); + } else { + if (ts->temp_local) + snprintf(buf, buf_size, "loc%d", idx - s->nb_globals); + else + snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals); + } + return buf; +} + +char *tcg_get_arg_str_i32(TCGContext *s, char *buf, int buf_size, TCGv_i32 arg) +{ + return tcg_get_arg_str_idx(s, buf, buf_size, GET_TCGV_I32(arg)); +} + +char *tcg_get_arg_str_i64(TCGContext *s, char *buf, int buf_size, TCGv_i64 arg) +{ + return tcg_get_arg_str_idx(s, buf, buf_size, GET_TCGV_I64(arg)); +} + +static int helper_cmp(const void *p1, const void *p2) +{ + const TCGHelperInfo *th1 = p1; + 
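/*
 * Illustrative sketch (not part of the upstream sources): tcg_gen_shifti_i64()
 * above open-codes a 64-bit shift by a constant using only 32-bit ops on a
 * 32-bit host.  For a left shift the value it produces is equivalent to the
 * plain C below (right shifts mirror it); the names are invented.
 */
#include <stdint.h>

static void sketch_shli_64_on_32(uint32_t lo, uint32_t hi, int c,
                                 uint32_t *rlo, uint32_t *rhi)
{
    if (c == 0) {
        *rlo = lo;
        *rhi = hi;
    } else if (c >= 32) {
        *rhi = lo << (c - 32);                 /* low word moves up, shifted */
        *rlo = 0;
    } else {
        *rhi = (hi << c) | (lo >> (32 - c));   /* bits carried across the words */
        *rlo = lo << c;
    }
}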
const TCGHelperInfo *th2 = p2; + if (th1->func < th2->func) + return -1; + else if (th1->func == th2->func) + return 0; + else + return 1; +} + +/* find helper definition (Note: A hash table would be better) */ +static TCGHelperInfo *tcg_find_helper(TCGContext *s, tcg_target_ulong val) +{ + int m, m_min, m_max; + TCGHelperInfo *th; + tcg_target_ulong v; + + if (unlikely(!s->helpers_sorted)) { +#ifdef VBOX + qemu_qsort(s->helpers, s->nb_helpers, sizeof(TCGHelperInfo), + helper_cmp); +#else + qsort(s->helpers, s->nb_helpers, sizeof(TCGHelperInfo), + helper_cmp); +#endif + s->helpers_sorted = 1; + } + + /* binary search */ + m_min = 0; + m_max = s->nb_helpers - 1; + while (m_min <= m_max) { + m = (m_min + m_max) >> 1; + th = &s->helpers[m]; + v = th->func; + if (v == val) + return th; + else if (val < v) { + m_max = m - 1; + } else { + m_min = m + 1; + } + } + return NULL; +} + +static const char * const cond_name[] = +{ + [TCG_COND_EQ] = "eq", + [TCG_COND_NE] = "ne", + [TCG_COND_LT] = "lt", + [TCG_COND_GE] = "ge", + [TCG_COND_LE] = "le", + [TCG_COND_GT] = "gt", + [TCG_COND_LTU] = "ltu", + [TCG_COND_GEU] = "geu", + [TCG_COND_LEU] = "leu", + [TCG_COND_GTU] = "gtu" +}; + +void tcg_dump_ops(TCGContext *s, FILE *outfile) +{ + const uint16_t *opc_ptr; + const TCGArg *args; + TCGArg arg; + TCGOpcode c; + int i, k, nb_oargs, nb_iargs, nb_cargs, first_insn; + const TCGOpDef *def; + char buf[128]; + + first_insn = 1; + opc_ptr = gen_opc_buf; + args = gen_opparam_buf; + while (opc_ptr < gen_opc_ptr) { + c = *opc_ptr++; + def = &tcg_op_defs[c]; + if (c == INDEX_op_debug_insn_start) { + uint64_t pc; +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS + pc = ((uint64_t)args[1] << 32) | args[0]; +#else + pc = args[0]; +#endif + if (!first_insn) + fprintf(outfile, "\n"); + fprintf(outfile, " ---- 0x%" PRIx64, pc); + first_insn = 0; + nb_oargs = def->nb_oargs; + nb_iargs = def->nb_iargs; + nb_cargs = def->nb_cargs; + } else if (c == INDEX_op_call) { + TCGArg arg; + + /* variable number of arguments */ + arg = *args++; + nb_oargs = arg >> 16; + nb_iargs = arg & 0xffff; + nb_cargs = def->nb_cargs; + + fprintf(outfile, " %s ", def->name); + + /* function name */ + fprintf(outfile, "%s", + tcg_get_arg_str_idx(s, buf, sizeof(buf), args[nb_oargs + nb_iargs - 1])); + /* flags */ + fprintf(outfile, ",$0x%" TCG_PRIlx, + args[nb_oargs + nb_iargs]); + /* nb out args */ + fprintf(outfile, ",$%d", nb_oargs); + for(i = 0; i < nb_oargs; i++) { + fprintf(outfile, ","); + fprintf(outfile, "%s", + tcg_get_arg_str_idx(s, buf, sizeof(buf), args[i])); + } + for(i = 0; i < (nb_iargs - 1); i++) { + fprintf(outfile, ","); + if (args[nb_oargs + i] == TCG_CALL_DUMMY_ARG) { + fprintf(outfile, "<dummy>"); + } else { + fprintf(outfile, "%s", + tcg_get_arg_str_idx(s, buf, sizeof(buf), args[nb_oargs + i])); + } + } + } else if (c == INDEX_op_movi_i32 +#if TCG_TARGET_REG_BITS == 64 + || c == INDEX_op_movi_i64 +#endif + ) { + tcg_target_ulong val; + TCGHelperInfo *th; + + nb_oargs = def->nb_oargs; + nb_iargs = def->nb_iargs; + nb_cargs = def->nb_cargs; + fprintf(outfile, " %s %s,$", def->name, + tcg_get_arg_str_idx(s, buf, sizeof(buf), args[0])); + val = args[1]; + th = tcg_find_helper(s, val); + if (th) { + fprintf(outfile, "%s", th->name); + } else { + if (c == INDEX_op_movi_i32) + fprintf(outfile, "0x%x", (uint32_t)val); + else + fprintf(outfile, "0x%" PRIx64 , (uint64_t)val); + } + } else { + fprintf(outfile, " %s ", def->name); + if (c == INDEX_op_nopn) { + /* variable number of arguments */ + nb_cargs = *args; + nb_oargs = 0; + nb_iargs = 0; 
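/*
 * Illustrative sketch (not part of the upstream sources): the generated IR
 * is a stream of 16-bit opcodes in gen_opc_buf with their operands in
 * gen_opparam_buf.  A call record starts with one packed word,
 * (nb_outputs << 16) | nb_inputs, where the input count includes the
 * function pointer, and it ends with a word holding the record's total
 * length so that backward passes such as the liveness analysis later in
 * this file can step over it via args[-1].  Decoding the packed word
 * amounts to (names invented):
 */
static void sketch_decode_call_counts(unsigned long packed,
                                      int *nb_outputs, int *nb_inputs)
{
    *nb_outputs = (int)(packed >> 16);
    *nb_inputs  = (int)(packed & 0xffff);   /* last input is the callee */
}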
+ } else { + nb_oargs = def->nb_oargs; + nb_iargs = def->nb_iargs; + nb_cargs = def->nb_cargs; + } + + k = 0; + for(i = 0; i < nb_oargs; i++) { + if (k != 0) + fprintf(outfile, ","); + fprintf(outfile, "%s", + tcg_get_arg_str_idx(s, buf, sizeof(buf), args[k++])); + } + for(i = 0; i < nb_iargs; i++) { + if (k != 0) + fprintf(outfile, ","); + fprintf(outfile, "%s", + tcg_get_arg_str_idx(s, buf, sizeof(buf), args[k++])); + } + switch (c) { + case INDEX_op_brcond_i32: +#if TCG_TARGET_REG_BITS == 32 + case INDEX_op_brcond2_i32: +#elif TCG_TARGET_REG_BITS == 64 + case INDEX_op_brcond_i64: +#endif + case INDEX_op_setcond_i32: +#if TCG_TARGET_REG_BITS == 32 + case INDEX_op_setcond2_i32: +#elif TCG_TARGET_REG_BITS == 64 + case INDEX_op_setcond_i64: +#endif + if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) + fprintf(outfile, ",%s", cond_name[args[k++]]); + else + fprintf(outfile, ",$0x%" TCG_PRIlx, args[k++]); + i = 1; + break; + default: + i = 0; + break; + } + for(; i < nb_cargs; i++) { + if (k != 0) + fprintf(outfile, ","); + arg = args[k++]; + fprintf(outfile, "$0x%" TCG_PRIlx, arg); + } + } + fprintf(outfile, "\n"); + args += nb_iargs + nb_oargs + nb_cargs; + } +} + +/* we give more priority to constraints with less registers */ +static int get_constraint_priority(const TCGOpDef *def, int k) +{ + const TCGArgConstraint *arg_ct; + + int i, n; + arg_ct = &def->args_ct[k]; + if (arg_ct->ct & TCG_CT_ALIAS) { + /* an alias is equivalent to a single register */ + n = 1; + } else { + if (!(arg_ct->ct & TCG_CT_REG)) + return 0; + n = 0; + for(i = 0; i < TCG_TARGET_NB_REGS; i++) { + if (tcg_regset_test_reg(arg_ct->u.regs, i)) + n++; + } + } + return TCG_TARGET_NB_REGS - n + 1; +} + +/* sort from highest priority to lowest */ +static void sort_constraints(TCGOpDef *def, int start, int n) +{ + int i, j, p1, p2, tmp; + + for(i = 0; i < n; i++) + def->sorted_args[start + i] = start + i; + if (n <= 1) + return; + for(i = 0; i < n - 1; i++) { + for(j = i + 1; j < n; j++) { + p1 = get_constraint_priority(def, def->sorted_args[start + i]); + p2 = get_constraint_priority(def, def->sorted_args[start + j]); + if (p1 < p2) { + tmp = def->sorted_args[start + i]; + def->sorted_args[start + i] = def->sorted_args[start + j]; + def->sorted_args[start + j] = tmp; + } + } + } +} + +void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs) +{ + TCGOpcode op; + TCGOpDef *def; + const char *ct_str; + int i, nb_args; + + for(;;) { + if (tdefs->op == (TCGOpcode)-1) + break; + op = tdefs->op; + assert((unsigned)op < (unsigned)NB_OPS); + def = &tcg_op_defs[op]; +#if defined(CONFIG_DEBUG_TCG) + /* Duplicate entry in op definitions? */ + assert(!def->used); + def->used = 1; +#endif + nb_args = def->nb_iargs + def->nb_oargs; + for(i = 0; i < nb_args; i++) { + ct_str = tdefs->args_ct_str[i]; + /* Incomplete TCGTargetOpDef entry? */ + assert(ct_str != NULL); + tcg_regset_clear(def->args_ct[i].u.regs); + def->args_ct[i].ct = 0; + if (ct_str[0] >= '0' && ct_str[0] <= '9') { + int oarg; + oarg = ct_str[0] - '0'; + assert(oarg < def->nb_oargs); + assert(def->args_ct[oarg].ct & TCG_CT_REG); + /* TCG_CT_ALIAS is for the output arguments. The input + argument is tagged with TCG_CT_IALIAS. 
*/ + def->args_ct[i] = def->args_ct[oarg]; + def->args_ct[oarg].ct = TCG_CT_ALIAS; + def->args_ct[oarg].alias_index = i; + def->args_ct[i].ct |= TCG_CT_IALIAS; + def->args_ct[i].alias_index = oarg; + } else { + for(;;) { + if (*ct_str == '\0') + break; + switch(*ct_str) { + case 'i': + def->args_ct[i].ct |= TCG_CT_CONST; + ct_str++; + break; + default: + if (target_parse_constraint(&def->args_ct[i], &ct_str) < 0) { + fprintf(stderr, "Invalid constraint '%s' for arg %d of operation '%s'\n", + ct_str, i, def->name); +#ifndef VBOX + exit(1); +#else + tcg_exit(1); +#endif + } + } + } + } + } + + /* TCGTargetOpDef entry with too much information? */ + assert(i == TCG_MAX_OP_ARGS || tdefs->args_ct_str[i] == NULL); + + /* sort the constraints (XXX: this is just an heuristic) */ + sort_constraints(def, 0, def->nb_oargs); + sort_constraints(def, def->nb_oargs, def->nb_iargs); + +#if 0 + { + int i; + + printf("%s: sorted=", def->name); + for(i = 0; i < def->nb_oargs + def->nb_iargs; i++) + printf(" %d", def->sorted_args[i]); + printf("\n"); + } +#endif + tdefs++; + } + +#if defined(CONFIG_DEBUG_TCG) + i = 0; + for (op = 0; op < ARRAY_SIZE(tcg_op_defs); op++) { + if (op < INDEX_op_call || op == INDEX_op_debug_insn_start) { + /* Wrong entry in op definitions? */ + if (tcg_op_defs[op].used) { + fprintf(stderr, "Invalid op definition for %s\n", + tcg_op_defs[op].name); + i = 1; + } + } else { + /* Missing entry in op definitions? */ + if (!tcg_op_defs[op].used) { + fprintf(stderr, "Missing op definition for %s\n", + tcg_op_defs[op].name); + i = 1; + } + } + } + if (i == 1) { + tcg_abort(); + } +#endif +} + +#ifdef USE_LIVENESS_ANALYSIS + +/* set a nop for an operation using 'nb_args' */ +static inline void tcg_set_nop(TCGContext *s, uint16_t *opc_ptr, + TCGArg *args, int nb_args) +{ + if (nb_args == 0) { + *opc_ptr = INDEX_op_nop; + } else { + *opc_ptr = INDEX_op_nopn; + args[0] = nb_args; + args[nb_args - 1] = nb_args; + } +} + +/* liveness analysis: end of function: globals are live, temps are + dead. */ +/* XXX: at this stage, not used as there would be little gains because + most TBs end with a conditional jump. */ +static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps) +{ + memset(dead_temps, 0, s->nb_globals); + memset(dead_temps + s->nb_globals, 1, s->nb_temps - s->nb_globals); +} + +/* liveness analysis: end of basic block: globals are live, temps are + dead, local temps are live. */ +static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps) +{ + int i; + TCGTemp *ts; + + memset(dead_temps, 0, s->nb_globals); + ts = &s->temps[s->nb_globals]; + for(i = s->nb_globals; i < s->nb_temps; i++) { + if (ts->temp_local) + dead_temps[i] = 0; + else + dead_temps[i] = 1; + ts++; + } +} + +/* Liveness analysis : update the opc_dead_iargs array to tell if a + given input arguments is dead. Instructions updating dead + temporaries are removed. 
*/ +static void tcg_liveness_analysis(TCGContext *s) +{ + int i, op_index, nb_args, nb_iargs, nb_oargs, arg, nb_ops; + TCGOpcode op; + TCGArg *args; + const TCGOpDef *def; + uint8_t *dead_temps; + unsigned int dead_iargs; + + gen_opc_ptr++; /* skip end */ + + nb_ops = gen_opc_ptr - gen_opc_buf; + + s->op_dead_iargs = tcg_malloc(nb_ops * sizeof(uint16_t)); + + dead_temps = tcg_malloc(s->nb_temps); + memset(dead_temps, 1, s->nb_temps); + + args = gen_opparam_ptr; + op_index = nb_ops - 1; + while (op_index >= 0) { + op = gen_opc_buf[op_index]; + def = &tcg_op_defs[op]; + switch(op) { + case INDEX_op_call: + { + int call_flags; + + nb_args = args[-1]; + args -= nb_args; + nb_iargs = args[0] & 0xffff; + nb_oargs = args[0] >> 16; + args++; + call_flags = args[nb_oargs + nb_iargs]; + + /* pure functions can be removed if their result is not + used */ + if (call_flags & TCG_CALL_PURE) { + for(i = 0; i < nb_oargs; i++) { + arg = args[i]; + if (!dead_temps[arg]) + goto do_not_remove_call; + } + tcg_set_nop(s, gen_opc_buf + op_index, + args - 1, nb_args); + } else { + do_not_remove_call: + + /* output args are dead */ + for(i = 0; i < nb_oargs; i++) { + arg = args[i]; + dead_temps[arg] = 1; + } + + if (!(call_flags & TCG_CALL_CONST)) { + /* globals are live (they may be used by the call) */ + memset(dead_temps, 0, s->nb_globals); + } + + /* input args are live */ + dead_iargs = 0; + for(i = 0; i < nb_iargs; i++) { + arg = args[i + nb_oargs]; + if (arg != TCG_CALL_DUMMY_ARG) { + if (dead_temps[arg]) { + dead_iargs |= (1 << i); + } + dead_temps[arg] = 0; + } + } + s->op_dead_iargs[op_index] = dead_iargs; + } + args--; + } + break; + case INDEX_op_set_label: + args--; + /* mark end of basic block */ + tcg_la_bb_end(s, dead_temps); + break; + case INDEX_op_debug_insn_start: + args -= def->nb_args; + break; + case INDEX_op_nopn: + nb_args = args[-1]; + args -= nb_args; + break; + case INDEX_op_discard: + args--; + /* mark the temporary as dead */ + dead_temps[args[0]] = 1; + break; + case INDEX_op_end: + break; + /* XXX: optimize by hardcoding common cases (e.g. triadic ops) */ + default: + args -= def->nb_args; + nb_iargs = def->nb_iargs; + nb_oargs = def->nb_oargs; + + /* Test if the operation can be removed because all + its outputs are dead. 
We assume that nb_oargs == 0 + implies side effects */ + if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) { + for(i = 0; i < nb_oargs; i++) { + arg = args[i]; + if (!dead_temps[arg]) + goto do_not_remove; + } + tcg_set_nop(s, gen_opc_buf + op_index, args, def->nb_args); +#ifdef CONFIG_PROFILER + s->del_op_count++; +#endif + } else { + do_not_remove: + + /* output args are dead */ + for(i = 0; i < nb_oargs; i++) { + arg = args[i]; + dead_temps[arg] = 1; + } + + /* if end of basic block, update */ + if (def->flags & TCG_OPF_BB_END) { + tcg_la_bb_end(s, dead_temps); + } else if (def->flags & TCG_OPF_CALL_CLOBBER) { + /* globals are live */ + memset(dead_temps, 0, s->nb_globals); + } + + /* input args are live */ + dead_iargs = 0; + for(i = 0; i < nb_iargs; i++) { + arg = args[i + nb_oargs]; + if (dead_temps[arg]) { + dead_iargs |= (1 << i); + } + dead_temps[arg] = 0; + } + s->op_dead_iargs[op_index] = dead_iargs; + } + break; + } + op_index--; + } + + if (args != gen_opparam_buf) + tcg_abort(); +} +#else +/* dummy liveness analysis */ +static void tcg_liveness_analysis(TCGContext *s) +{ + int nb_ops; + nb_ops = gen_opc_ptr - gen_opc_buf; + + s->op_dead_iargs = tcg_malloc(nb_ops * sizeof(uint16_t)); + memset(s->op_dead_iargs, 0, nb_ops * sizeof(uint16_t)); +} +#endif + +#ifndef NDEBUG +static void dump_regs(TCGContext *s) +{ + TCGTemp *ts; + int i; + char buf[64]; + + for(i = 0; i < s->nb_temps; i++) { + ts = &s->temps[i]; + printf(" %10s: ", tcg_get_arg_str_idx(s, buf, sizeof(buf), i)); + switch(ts->val_type) { + case TEMP_VAL_REG: + printf("%s", tcg_target_reg_names[ts->reg]); + break; + case TEMP_VAL_MEM: + printf("%d(%s)", (int)ts->mem_offset, tcg_target_reg_names[ts->mem_reg]); + break; + case TEMP_VAL_CONST: + printf("$0x%" TCG_PRIlx, ts->val); + break; + case TEMP_VAL_DEAD: + printf("D"); + break; + default: + printf("???"); + break; + } + printf("\n"); + } + + for(i = 0; i < TCG_TARGET_NB_REGS; i++) { + if (s->reg_to_temp[i] >= 0) { + printf("%s: %s\n", + tcg_target_reg_names[i], + tcg_get_arg_str_idx(s, buf, sizeof(buf), s->reg_to_temp[i])); + } + } +} + +static void check_regs(TCGContext *s) +{ + int reg, k; + TCGTemp *ts; + char buf[64]; + + for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) { + k = s->reg_to_temp[reg]; + if (k >= 0) { + ts = &s->temps[k]; + if (ts->val_type != TEMP_VAL_REG || + ts->reg != reg) { + printf("Inconsistency for register %s:\n", + tcg_target_reg_names[reg]); + goto fail; + } + } + } + for(k = 0; k < s->nb_temps; k++) { + ts = &s->temps[k]; + if (ts->val_type == TEMP_VAL_REG && + !ts->fixed_reg && + s->reg_to_temp[ts->reg] != k) { + printf("Inconsistency for temp %s:\n", + tcg_get_arg_str_idx(s, buf, sizeof(buf), k)); + fail: + printf("reg state:\n"); + dump_regs(s); + tcg_abort(); + } + } +} +#endif + +static void temp_allocate_frame(TCGContext *s, int temp) +{ + TCGTemp *ts; + ts = &s->temps[temp]; + s->current_frame_offset = (s->current_frame_offset + sizeof(tcg_target_long) - 1) & ~(sizeof(tcg_target_long) - 1); +#ifndef VBOX + if (s->current_frame_offset + sizeof(tcg_target_long) > s->frame_end) +#else + if ((tcg_target_long)s->current_frame_offset + sizeof(tcg_target_long) > s->frame_end) +#endif + tcg_abort(); + ts->mem_offset = s->current_frame_offset; + ts->mem_reg = s->frame_reg; + ts->mem_allocated = 1; + s->current_frame_offset += sizeof(tcg_target_long); +} + +/* free register 'reg' by spilling the corresponding temporary if necessary */ +static void tcg_reg_free(TCGContext *s, int reg) +{ + TCGTemp *ts; + int temp; + + temp = 
s->reg_to_temp[reg]; + if (temp != -1) { + ts = &s->temps[temp]; + assert(ts->val_type == TEMP_VAL_REG); + if (!ts->mem_coherent) { + if (!ts->mem_allocated) + temp_allocate_frame(s, temp); + tcg_out_st(s, ts->type, reg, ts->mem_reg, ts->mem_offset); + } + ts->val_type = TEMP_VAL_MEM; + s->reg_to_temp[reg] = -1; + } +} + +/* Allocate a register belonging to reg1 & ~reg2 */ +static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2) +{ + int i, reg; + TCGRegSet reg_ct; + + tcg_regset_andnot(reg_ct, reg1, reg2); + + /* first try free registers */ + for(i = 0; i < ARRAY_SIZE(tcg_target_reg_alloc_order); i++) { + reg = tcg_target_reg_alloc_order[i]; + if (tcg_regset_test_reg(reg_ct, reg) && s->reg_to_temp[reg] == -1) + return reg; + } + + /* XXX: do better spill choice */ + for(i = 0; i < ARRAY_SIZE(tcg_target_reg_alloc_order); i++) { + reg = tcg_target_reg_alloc_order[i]; + if (tcg_regset_test_reg(reg_ct, reg)) { + tcg_reg_free(s, reg); + return reg; + } + } + + tcg_abort(); +} + +/* save a temporary to memory. 'allocated_regs' is used in case a + temporary registers needs to be allocated to store a constant. */ +static void temp_save(TCGContext *s, int temp, TCGRegSet allocated_regs) +{ + TCGTemp *ts; + int reg; + + ts = &s->temps[temp]; + if (!ts->fixed_reg) { + switch(ts->val_type) { + case TEMP_VAL_REG: + tcg_reg_free(s, ts->reg); + break; + case TEMP_VAL_DEAD: + ts->val_type = TEMP_VAL_MEM; + break; + case TEMP_VAL_CONST: + reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], + allocated_regs); + if (!ts->mem_allocated) + temp_allocate_frame(s, temp); + tcg_out_movi(s, ts->type, reg, ts->val); + tcg_out_st(s, ts->type, reg, ts->mem_reg, ts->mem_offset); + ts->val_type = TEMP_VAL_MEM; + break; + case TEMP_VAL_MEM: + break; + default: + tcg_abort(); + } + } +} + +/* save globals to their cannonical location and assume they can be + modified be the following code. 'allocated_regs' is used in case a + temporary registers needs to be allocated to store a constant. */ +static void save_globals(TCGContext *s, TCGRegSet allocated_regs) +{ + int i; + + for(i = 0; i < s->nb_globals; i++) { + temp_save(s, i, allocated_regs); + } +} + +/* at the end of a basic block, we assume all temporaries are dead and + all globals are stored at their canonical location. 
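Temps marked temp_local are spilled to memory instead, so their value survives the block boundary; other temps are simply marked dead.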
*/ +static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) +{ + TCGTemp *ts; + int i; + + for(i = s->nb_globals; i < s->nb_temps; i++) { + ts = &s->temps[i]; + if (ts->temp_local) { + temp_save(s, i, allocated_regs); + } else { + if (ts->val_type == TEMP_VAL_REG) { + s->reg_to_temp[ts->reg] = -1; + } + ts->val_type = TEMP_VAL_DEAD; + } + } + + save_globals(s, allocated_regs); +} + +#define IS_DEAD_IARG(n) ((dead_iargs >> (n)) & 1) + +static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args) +{ + TCGTemp *ots; + tcg_target_ulong val; + + ots = &s->temps[args[0]]; + val = args[1]; + + if (ots->fixed_reg) { + /* for fixed registers, we do not do any constant + propagation */ + tcg_out_movi(s, ots->type, ots->reg, val); + } else { + /* The movi is not explicitly generated here */ + if (ots->val_type == TEMP_VAL_REG) + s->reg_to_temp[ots->reg] = -1; + ots->val_type = TEMP_VAL_CONST; + ots->val = val; + } +} + +static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, + const TCGArg *args, + unsigned int dead_iargs) +{ + TCGTemp *ts, *ots; + int reg; + const TCGArgConstraint *arg_ct; + + ots = &s->temps[args[0]]; + ts = &s->temps[args[1]]; + arg_ct = &def->args_ct[0]; + + /* XXX: always mark arg dead if IS_DEAD_IARG(0) */ + if (ts->val_type == TEMP_VAL_REG) { + if (IS_DEAD_IARG(0) && !ts->fixed_reg && !ots->fixed_reg) { + /* the mov can be suppressed */ + if (ots->val_type == TEMP_VAL_REG) + s->reg_to_temp[ots->reg] = -1; + reg = ts->reg; + s->reg_to_temp[reg] = -1; + ts->val_type = TEMP_VAL_DEAD; + } else { + if (ots->val_type == TEMP_VAL_REG) { + reg = ots->reg; + } else { + reg = tcg_reg_alloc(s, arg_ct->u.regs, s->reserved_regs); + } + if (ts->reg != reg) { + tcg_out_mov(s, ots->type, reg, ts->reg); + } + } + } else if (ts->val_type == TEMP_VAL_MEM) { + if (ots->val_type == TEMP_VAL_REG) { + reg = ots->reg; + } else { + reg = tcg_reg_alloc(s, arg_ct->u.regs, s->reserved_regs); + } + tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset); + } else if (ts->val_type == TEMP_VAL_CONST) { + if (ots->fixed_reg) { + reg = ots->reg; + tcg_out_movi(s, ots->type, reg, ts->val); + } else { + /* propagate constant */ + if (ots->val_type == TEMP_VAL_REG) + s->reg_to_temp[ots->reg] = -1; + ots->val_type = TEMP_VAL_CONST; + ots->val = ts->val; + return; + } + } else { + tcg_abort(); + } + s->reg_to_temp[reg] = args[0]; + ots->reg = reg; + ots->val_type = TEMP_VAL_REG; + ots->mem_coherent = 0; +} + +static void tcg_reg_alloc_op(TCGContext *s, + const TCGOpDef *def, TCGOpcode opc, + const TCGArg *args, + unsigned int dead_iargs) +{ + TCGRegSet allocated_regs; + int i, k, nb_iargs, nb_oargs, reg; + TCGArg arg; + const TCGArgConstraint *arg_ct; + TCGTemp *ts; + TCGArg new_args[TCG_MAX_OP_ARGS]; + int const_args[TCG_MAX_OP_ARGS]; + + nb_oargs = def->nb_oargs; + nb_iargs = def->nb_iargs; + + /* copy constants */ + memcpy(new_args + nb_oargs + nb_iargs, + args + nb_oargs + nb_iargs, + sizeof(TCGArg) * def->nb_cargs); + + /* satisfy input constraints */ + tcg_regset_set(allocated_regs, s->reserved_regs); + for(k = 0; k < nb_iargs; k++) { + i = def->sorted_args[nb_oargs + k]; + arg = args[i]; + arg_ct = &def->args_ct[i]; + ts = &s->temps[arg]; + if (ts->val_type == TEMP_VAL_MEM) { + reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); + tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset); + ts->val_type = TEMP_VAL_REG; + ts->reg = reg; + ts->mem_coherent = 1; + s->reg_to_temp[reg] = arg; + } else if (ts->val_type == TEMP_VAL_CONST) { + if 
(tcg_target_const_match(ts->val, arg_ct)) { + /* constant is OK for instruction */ + const_args[i] = 1; + new_args[i] = ts->val; + goto iarg_end; + } else { + /* need to move to a register */ + reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); + tcg_out_movi(s, ts->type, reg, ts->val); + ts->val_type = TEMP_VAL_REG; + ts->reg = reg; + ts->mem_coherent = 0; + s->reg_to_temp[reg] = arg; + } + } + assert(ts->val_type == TEMP_VAL_REG); + if (arg_ct->ct & TCG_CT_IALIAS) { + if (ts->fixed_reg) { + /* if fixed register, we must allocate a new register + if the alias is not the same register */ + if (arg != args[arg_ct->alias_index]) + goto allocate_in_reg; + } else { + /* if the input is aliased to an output and if it is + not dead after the instruction, we must allocate + a new register and move it */ + if (!IS_DEAD_IARG(i - nb_oargs)) + goto allocate_in_reg; + } + } + reg = ts->reg; + if (tcg_regset_test_reg(arg_ct->u.regs, reg)) { + /* nothing to do : the constraint is satisfied */ + } else { + allocate_in_reg: + /* allocate a new register matching the constraint + and move the temporary register into it */ + reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); + tcg_out_mov(s, ts->type, reg, ts->reg); + } + new_args[i] = reg; + const_args[i] = 0; + tcg_regset_set_reg(allocated_regs, reg); + iarg_end: ; + } + + if (def->flags & TCG_OPF_BB_END) { + tcg_reg_alloc_bb_end(s, allocated_regs); + } else { + /* mark dead temporaries and free the associated registers */ + for(i = 0; i < nb_iargs; i++) { + arg = args[nb_oargs + i]; + if (IS_DEAD_IARG(i)) { + ts = &s->temps[arg]; + if (!ts->fixed_reg) { + if (ts->val_type == TEMP_VAL_REG) + s->reg_to_temp[ts->reg] = -1; + ts->val_type = TEMP_VAL_DEAD; + } + } + } + + if (def->flags & TCG_OPF_CALL_CLOBBER) { + /* XXX: permit generic clobber register list ? */ + for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) { + if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) { + tcg_reg_free(s, reg); + } + } + /* XXX: for load/store we could do that only for the slow path + (i.e. when a memory callback is called) */ + + /* store globals and free associated registers (we assume the insn + can modify any global. 
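They are flushed back to their canonical memory locations before the op is emitted.)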
*/ + save_globals(s, allocated_regs); + } + + /* satisfy the output constraints */ + tcg_regset_set(allocated_regs, s->reserved_regs); + for(k = 0; k < nb_oargs; k++) { + i = def->sorted_args[k]; + arg = args[i]; + arg_ct = &def->args_ct[i]; + ts = &s->temps[arg]; + if (arg_ct->ct & TCG_CT_ALIAS) { + reg = new_args[arg_ct->alias_index]; + } else { + /* if fixed register, we try to use it */ + reg = ts->reg; + if (ts->fixed_reg && + tcg_regset_test_reg(arg_ct->u.regs, reg)) { + goto oarg_end; + } + reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); + } + tcg_regset_set_reg(allocated_regs, reg); + /* if a fixed register is used, then a move will be done afterwards */ + if (!ts->fixed_reg) { + if (ts->val_type == TEMP_VAL_REG) + s->reg_to_temp[ts->reg] = -1; + ts->val_type = TEMP_VAL_REG; + ts->reg = reg; + /* temp value is modified, so the value kept in memory is + potentially not the same */ + ts->mem_coherent = 0; + s->reg_to_temp[reg] = arg; + } + oarg_end: + new_args[i] = reg; + } + } + + /* emit instruction */ + tcg_out_op(s, opc, new_args, const_args); + + /* move the outputs in the correct register if needed */ + for(i = 0; i < nb_oargs; i++) { + ts = &s->temps[args[i]]; + reg = new_args[i]; + if (ts->fixed_reg && ts->reg != reg) { + tcg_out_mov(s, ts->type, ts->reg, reg); + } + } +} + +#ifdef TCG_TARGET_STACK_GROWSUP +#define STACK_DIR(x) (-(x)) +#else +#define STACK_DIR(x) (x) +#endif + +static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, + TCGOpcode opc, const TCGArg *args, + unsigned int dead_iargs) +{ + int nb_iargs, nb_oargs, flags, nb_regs, i, reg, nb_params; + TCGArg arg, func_arg; + TCGTemp *ts; + tcg_target_long stack_offset, call_stack_size, func_addr; + int const_func_arg, allocate_args; + TCGRegSet allocated_regs; + const TCGArgConstraint *arg_ct; + + arg = *args++; + + nb_oargs = arg >> 16; + nb_iargs = arg & 0xffff; + nb_params = nb_iargs - 1; + + flags = args[nb_oargs + nb_iargs]; + + nb_regs = tcg_target_get_call_iarg_regs_count(flags); + if (nb_regs > nb_params) + nb_regs = nb_params; + + /* assign stack slots first */ + /* XXX: preallocate call stack */ + call_stack_size = (nb_params - nb_regs) * sizeof(tcg_target_long); + call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) & + ~(TCG_TARGET_STACK_ALIGN - 1); + allocate_args = (call_stack_size > TCG_STATIC_CALL_ARGS_SIZE); + if (allocate_args) { + tcg_out_addi(s, TCG_REG_CALL_STACK, -STACK_DIR(call_stack_size)); + } + + stack_offset = TCG_TARGET_CALL_STACK_OFFSET; + for(i = nb_regs; i < nb_params; i++) { + arg = args[nb_oargs + i]; +#ifdef TCG_TARGET_STACK_GROWSUP + stack_offset -= sizeof(tcg_target_long); +#endif + if (arg != TCG_CALL_DUMMY_ARG) { + ts = &s->temps[arg]; + if (ts->val_type == TEMP_VAL_REG) { + tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK, stack_offset); + } else if (ts->val_type == TEMP_VAL_MEM) { + reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], + s->reserved_regs); + /* XXX: not correct if reading values from the stack */ + tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset); + tcg_out_st(s, ts->type, reg, TCG_REG_CALL_STACK, stack_offset); + } else if (ts->val_type == TEMP_VAL_CONST) { + reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], + s->reserved_regs); + /* XXX: sign extend may be needed on some targets */ + tcg_out_movi(s, ts->type, reg, ts->val); + tcg_out_st(s, ts->type, reg, TCG_REG_CALL_STACK, stack_offset); + } else { + tcg_abort(); + } + } +#ifndef TCG_TARGET_STACK_GROWSUP + stack_offset += sizeof(tcg_target_long); 
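+ /* Example: with 3 register argument slots and 5 parameters, parameters
+    3 and 4 end up at TCG_TARGET_CALL_STACK_OFFSET + 0 and
+    TCG_TARGET_CALL_STACK_OFFSET + sizeof(tcg_target_long) when the stack
+    grows downwards. */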
+#endif + } + + /* assign input registers */ + tcg_regset_set(allocated_regs, s->reserved_regs); + for(i = 0; i < nb_regs; i++) { + arg = args[nb_oargs + i]; + if (arg != TCG_CALL_DUMMY_ARG) { + ts = &s->temps[arg]; + reg = tcg_target_call_iarg_regs[i]; + tcg_reg_free(s, reg); + if (ts->val_type == TEMP_VAL_REG) { + if (ts->reg != reg) { + tcg_out_mov(s, ts->type, reg, ts->reg); + } + } else if (ts->val_type == TEMP_VAL_MEM) { + tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset); + } else if (ts->val_type == TEMP_VAL_CONST) { + /* XXX: sign extend ? */ + tcg_out_movi(s, ts->type, reg, ts->val); + } else { + tcg_abort(); + } + tcg_regset_set_reg(allocated_regs, reg); + } + } + + /* assign function address */ + func_arg = args[nb_oargs + nb_iargs - 1]; + arg_ct = &def->args_ct[0]; + ts = &s->temps[func_arg]; + func_addr = ts->val; + const_func_arg = 0; + if (ts->val_type == TEMP_VAL_MEM) { + reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); + tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset); + func_arg = reg; + tcg_regset_set_reg(allocated_regs, reg); + } else if (ts->val_type == TEMP_VAL_REG) { + reg = ts->reg; + if (!tcg_regset_test_reg(arg_ct->u.regs, reg)) { + reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); + tcg_out_mov(s, ts->type, reg, ts->reg); + } + func_arg = reg; + tcg_regset_set_reg(allocated_regs, reg); + } else if (ts->val_type == TEMP_VAL_CONST) { + if (tcg_target_const_match(func_addr, arg_ct)) { + const_func_arg = 1; + func_arg = func_addr; + } else { + reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); + tcg_out_movi(s, ts->type, reg, func_addr); + func_arg = reg; + tcg_regset_set_reg(allocated_regs, reg); + } + } else { + tcg_abort(); + } + + + /* mark dead temporaries and free the associated registers */ + for(i = 0; i < nb_iargs; i++) { + arg = args[nb_oargs + i]; + if (IS_DEAD_IARG(i)) { + ts = &s->temps[arg]; + if (!ts->fixed_reg) { + if (ts->val_type == TEMP_VAL_REG) + s->reg_to_temp[ts->reg] = -1; + ts->val_type = TEMP_VAL_DEAD; + } + } + } + + /* clobber call registers */ + for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) { + if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) { + tcg_reg_free(s, reg); + } + } + + /* store globals and free associated registers (we assume the call + can modify any global. 
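Calls flagged TCG_CALL_CONST are exempt, as tested just below.)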
*/ + if (!(flags & TCG_CALL_CONST)) { + save_globals(s, allocated_regs); + } + + tcg_out_op(s, opc, &func_arg, &const_func_arg); + + if (allocate_args) { + tcg_out_addi(s, TCG_REG_CALL_STACK, STACK_DIR(call_stack_size)); + } + + /* assign output registers and emit moves if needed */ + for(i = 0; i < nb_oargs; i++) { + arg = args[i]; + ts = &s->temps[arg]; + reg = tcg_target_call_oarg_regs[i]; + assert(s->reg_to_temp[reg] == -1); + if (ts->fixed_reg) { + if (ts->reg != reg) { + tcg_out_mov(s, ts->type, ts->reg, reg); + } + } else { + if (ts->val_type == TEMP_VAL_REG) + s->reg_to_temp[ts->reg] = -1; + ts->val_type = TEMP_VAL_REG; + ts->reg = reg; + ts->mem_coherent = 0; + s->reg_to_temp[reg] = arg; + } + } + + return nb_iargs + nb_oargs + def->nb_cargs + 1; +} + +#ifdef CONFIG_PROFILER + +static int64_t tcg_table_op_count[NB_OPS]; + +static void dump_op_count(void) +{ + int i; + FILE *f; + f = fopen("/tmp/op.log", "w"); + for(i = INDEX_op_end; i < NB_OPS; i++) { + fprintf(f, "%s %" PRId64 "\n", tcg_op_defs[i].name, tcg_table_op_count[i]); + } + fclose(f); +} +#endif + + +static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf, + intptr_t search_pc) +{ + TCGOpcode opc; + int op_index; + const TCGOpDef *def; + unsigned int dead_iargs; + const TCGArg *args; + +#ifdef DEBUG_DISAS + if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) { + qemu_log("OP:\n"); + tcg_dump_ops(s, logfile); + qemu_log("\n"); + } +#endif + +#ifdef CONFIG_PROFILER + s->la_time -= profile_getclock(); +#endif + tcg_liveness_analysis(s); +#ifdef CONFIG_PROFILER + s->la_time += profile_getclock(); +#endif + +#ifdef DEBUG_DISAS +# ifdef USE_LIVENESS_ANALYSIS /* vbox */ + if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT))) { + qemu_log("OP after liveness analysis:\n"); + tcg_dump_ops(s, logfile); + qemu_log("\n"); + } +# endif /* USE_LIVENESS_ANALYSIS - vbox */ +#endif + + tcg_reg_alloc_start(s); + + s->code_buf = gen_code_buf; + s->code_ptr = gen_code_buf; + + args = gen_opparam_buf; + op_index = 0; + + for(;;) { + opc = gen_opc_buf[op_index]; +#ifdef CONFIG_PROFILER + tcg_table_op_count[opc]++; +#endif + def = &tcg_op_defs[opc]; +#if 0 + printf("%s: %d %d %d\n", def->name, + def->nb_oargs, def->nb_iargs, def->nb_cargs); + // dump_regs(s); +#endif + switch(opc) { + case INDEX_op_mov_i32: +#if TCG_TARGET_REG_BITS == 64 + case INDEX_op_mov_i64: +#endif + dead_iargs = s->op_dead_iargs[op_index]; + tcg_reg_alloc_mov(s, def, args, dead_iargs); + break; + case INDEX_op_movi_i32: +#if TCG_TARGET_REG_BITS == 64 + case INDEX_op_movi_i64: +#endif + tcg_reg_alloc_movi(s, args); + break; + case INDEX_op_debug_insn_start: + /* debug instruction */ + break; + case INDEX_op_nop: + case INDEX_op_nop1: + case INDEX_op_nop2: + case INDEX_op_nop3: + break; + case INDEX_op_nopn: + args += args[0]; + goto next; + case INDEX_op_discard: + { + TCGTemp *ts; + ts = &s->temps[args[0]]; + /* mark the temporary as dead */ + if (!ts->fixed_reg) { + if (ts->val_type == TEMP_VAL_REG) + s->reg_to_temp[ts->reg] = -1; + ts->val_type = TEMP_VAL_DEAD; + } + } + break; + case INDEX_op_set_label: + tcg_reg_alloc_bb_end(s, s->reserved_regs); + tcg_out_label(s, args[0], (intptr_t)s->code_ptr); + break; + case INDEX_op_call: + dead_iargs = s->op_dead_iargs[op_index]; + args += tcg_reg_alloc_call(s, def, opc, args, dead_iargs); + goto next; + case INDEX_op_end: + goto the_end; + default: + /* Note: in order to speed up the code, it would be much + faster to have specialized register allocator functions for + some common argument patterns */ + 
dead_iargs = s->op_dead_iargs[op_index]; + tcg_reg_alloc_op(s, def, opc, args, dead_iargs); + break; + } + args += def->nb_args; + next: + if (search_pc >= 0 && search_pc < s->code_ptr - gen_code_buf) { + return op_index; + } + op_index++; +#ifndef NDEBUG + check_regs(s); +#endif + } + the_end: + return -1; +} + +int tcg_gen_code(TCGContext *s, uint8_t *gen_code_buf) +{ +#ifdef CONFIG_PROFILER + { + int n; + n = (gen_opc_ptr - gen_opc_buf); + s->op_count += n; + if (n > s->op_count_max) + s->op_count_max = n; + + s->temp_count += s->nb_temps; + if (s->nb_temps > s->temp_count_max) + s->temp_count_max = s->nb_temps; + } +#endif + + tcg_gen_code_common(s, gen_code_buf, -1); + + /* flush instruction cache */ + flush_icache_range((uintptr_t)gen_code_buf, + (uintptr_t)s->code_ptr); + return s->code_ptr - gen_code_buf; +} + +/* Return the index of the micro operation such as the pc after is < + offset bytes from the start of the TB. The contents of gen_code_buf must + not be changed, though writing the same values is ok. + Return -1 if not found. */ +int tcg_gen_code_search_pc(TCGContext *s, uint8_t *gen_code_buf, intptr_t offset) +{ + return tcg_gen_code_common(s, gen_code_buf, offset); +} + +#ifdef CONFIG_PROFILER +void tcg_dump_info(FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...)) +{ + TCGContext *s = &tcg_ctx; + int64_t tot; + + tot = s->interm_time + s->code_time; + cpu_fprintf(f, "JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n", + tot, tot / 2.4e9); + cpu_fprintf(f, "translated TBs %" PRId64 " (aborted=%" PRId64 " %0.1f%%)\n", + s->tb_count, + s->tb_count1 - s->tb_count, + s->tb_count1 ? (double)(s->tb_count1 - s->tb_count) / s->tb_count1 * 100.0 : 0); + cpu_fprintf(f, "avg ops/TB %0.1f max=%d\n", + s->tb_count ? (double)s->op_count / s->tb_count : 0, s->op_count_max); + cpu_fprintf(f, "deleted ops/TB %0.2f\n", + s->tb_count ? + (double)s->del_op_count / s->tb_count : 0); + cpu_fprintf(f, "avg temps/TB %0.2f max=%d\n", + s->tb_count ? + (double)s->temp_count / s->tb_count : 0, + s->temp_count_max); + + cpu_fprintf(f, "cycles/op %0.1f\n", + s->op_count ? (double)tot / s->op_count : 0); + cpu_fprintf(f, "cycles/in byte %0.1f\n", + s->code_in_len ? (double)tot / s->code_in_len : 0); + cpu_fprintf(f, "cycles/out byte %0.1f\n", + s->code_out_len ? (double)tot / s->code_out_len : 0); + if (tot == 0) + tot = 1; + cpu_fprintf(f, " gen_interm time %0.1f%%\n", + (double)s->interm_time / tot * 100.0); + cpu_fprintf(f, " gen_code time %0.1f%%\n", + (double)s->code_time / tot * 100.0); + cpu_fprintf(f, "liveness/code time %0.1f%%\n", + (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0); + cpu_fprintf(f, "cpu_restore count %" PRId64 "\n", + s->restore_count); + cpu_fprintf(f, " avg cycles %0.1f\n", + s->restore_count ? 
(double)s->restore_time / s->restore_count : 0); + + dump_op_count(); +} +#else +void tcg_dump_info(FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...)) +{ + cpu_fprintf(f, "[TCG profiler not compiled]\n"); +} +#endif diff --git a/src/recompiler/tcg/tcg.h b/src/recompiler/tcg/tcg.h new file mode 100644 index 00000000..819fa6c7 --- /dev/null +++ b/src/recompiler/tcg/tcg.h @@ -0,0 +1,512 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "tcg-target.h" +#include "tcg-runtime.h" + +#if TCG_TARGET_REG_BITS == 32 +typedef int32_t tcg_target_long; +typedef uint32_t tcg_target_ulong; +#define TCG_PRIlx PRIx32 +#define TCG_PRIld PRId32 +#elif TCG_TARGET_REG_BITS == 64 +typedef int64_t tcg_target_long; +typedef uint64_t tcg_target_ulong; +#define TCG_PRIlx PRIx64 +#define TCG_PRIld PRId64 +#else +#error unsupported +#endif + +#if TCG_TARGET_NB_REGS <= 32 +typedef uint32_t TCGRegSet; +#elif TCG_TARGET_NB_REGS <= 64 +typedef uint64_t TCGRegSet; +#else +#error unsupported +#endif + +typedef enum TCGOpcode { +#define DEF(name, oargs, iargs, cargs, flags) INDEX_op_ ## name, +#include "tcg-opc.h" +#undef DEF + NB_OPS, +} TCGOpcode; + +#define tcg_regset_clear(d) (d) = 0 +#define tcg_regset_set(d, s) (d) = (s) +#define tcg_regset_set32(d, reg, val32) (d) |= (val32) << (reg) +#define tcg_regset_set_reg(d, r) (d) |= 1L << (r) +#define tcg_regset_reset_reg(d, r) (d) &= ~(1L << (r)) +#define tcg_regset_test_reg(d, r) (((d) >> (r)) & 1) +#define tcg_regset_or(d, a, b) (d) = (a) | (b) +#define tcg_regset_and(d, a, b) (d) = (a) & (b) +#define tcg_regset_andnot(d, a, b) (d) = (a) & ~(b) +#define tcg_regset_not(d, a) (d) = ~(a) + +typedef struct TCGRelocation { + struct TCGRelocation *next; + int type; + uint8_t *ptr; + tcg_target_long addend; +} TCGRelocation; + +typedef struct TCGLabel { + int has_value; + union { + tcg_target_ulong value; + TCGRelocation *first_reloc; + } u; +} TCGLabel; + +typedef struct TCGPool { + struct TCGPool *next; + int size; + uint8_t data[0] __attribute__ ((aligned)); +} TCGPool; + +#define TCG_POOL_CHUNK_SIZE 32768 + +#define TCG_MAX_LABELS 512 + +#define TCG_MAX_TEMPS 512 + +/* when the size of the arguments of a called function is smaller than + this value, they are statically allocated in the TB stack frame */ +#define TCG_STATIC_CALL_ARGS_SIZE 128 + +typedef enum TCGType { + TCG_TYPE_I32, + TCG_TYPE_I64, + TCG_TYPE_COUNT, /* 
number of different types */ + + /* An alias for the size of the host register. */ +#if TCG_TARGET_REG_BITS == 32 + TCG_TYPE_REG = TCG_TYPE_I32, +#else + TCG_TYPE_REG = TCG_TYPE_I64, +#endif + + /* An alias for the size of the native pointer. We don't currently + support any hosts with 64-bit registers and 32-bit pointers. */ + TCG_TYPE_PTR = TCG_TYPE_REG, + + /* An alias for the size of the target "long", aka register. */ +#if TARGET_LONG_BITS == 64 + TCG_TYPE_TL = TCG_TYPE_I64, +#else + TCG_TYPE_TL = TCG_TYPE_I32, +#endif +} TCGType; + +typedef tcg_target_ulong TCGArg; + +/* Define a type and accessor macros for varables. Using a struct is + nice because it gives some level of type safely. Ideally the compiler + be able to see through all this. However in practice this is not true, + expecially on targets with braindamaged ABIs (e.g. i386). + We use plain int by default to avoid this runtime overhead. + Users of tcg_gen_* don't need to know about any of this, and should + treat TCGv as an opaque type. + In additon we do typechecking for different types of variables. TCGv_i32 + and TCGv_i64 are 32/64-bit variables respectively. TCGv and TCGv_ptr + are aliases for target_ulong and host pointer sized values respectively. + */ + +#ifdef CONFIG_DEBUG_TCG +#define DEBUG_TCGV 1 +#endif + +#ifdef DEBUG_TCGV + +typedef struct +{ + int i32; +} TCGv_i32; + +typedef struct +{ + int i64; +} TCGv_i64; + +#define MAKE_TCGV_I32(i) __extension__ \ + ({ TCGv_i32 make_tcgv_tmp = {i}; make_tcgv_tmp;}) +#define MAKE_TCGV_I64(i) __extension__ \ + ({ TCGv_i64 make_tcgv_tmp = {i}; make_tcgv_tmp;}) +#define GET_TCGV_I32(t) ((t).i32) +#define GET_TCGV_I64(t) ((t).i64) +#if TCG_TARGET_REG_BITS == 32 +#define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t)) +#define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1) +#endif + +#else /* !DEBUG_TCGV */ + +typedef int TCGv_i32; +typedef int TCGv_i64; +#define MAKE_TCGV_I32(x) (x) +#define MAKE_TCGV_I64(x) (x) +#define GET_TCGV_I32(t) (t) +#define GET_TCGV_I64(t) (t) + +#if TCG_TARGET_REG_BITS == 32 +#define TCGV_LOW(t) (t) +#define TCGV_HIGH(t) ((t) + 1) +#endif + +#endif /* DEBUG_TCGV */ + +#define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b)) +#define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b)) + +/* Dummy definition to avoid compiler warnings. */ +#define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1) +#define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1) + +/* call flags */ +#define TCG_CALL_TYPE_MASK 0x000f +#define TCG_CALL_TYPE_STD 0x0000 /* standard C call */ +#define TCG_CALL_TYPE_REGPARM_1 0x0001 /* i386 style regparm call (1 reg) */ +#define TCG_CALL_TYPE_REGPARM_2 0x0002 /* i386 style regparm call (2 regs) */ +#define TCG_CALL_TYPE_REGPARM 0x0003 /* i386 style regparm call (3 regs) */ +/* A pure function only reads its arguments and TCG global variables + and cannot raise exceptions. Hence a call to a pure function can be + safely suppressed if the return value is not used. */ +#define TCG_CALL_PURE 0x0010 +/* A const function only reads its arguments and does not use TCG + global variables. Hence a call to such a function does not + save TCG global variables back to their canonical location. 
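A helper that merely computes a value from its arguments and cannot fault can typically carry both TCG_CALL_PURE and TCG_CALL_CONST.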
*/ +#define TCG_CALL_CONST 0x0020 + +/* used to align parameters */ +#define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1) +#define TCG_CALL_DUMMY_ARG ((TCGArg)(-1)) + +typedef enum { + TCG_COND_EQ, + TCG_COND_NE, + TCG_COND_LT, + TCG_COND_GE, + TCG_COND_LE, + TCG_COND_GT, + /* unsigned */ + TCG_COND_LTU, + TCG_COND_GEU, + TCG_COND_LEU, + TCG_COND_GTU, +} TCGCond; + +/* Invert the sense of the comparison. */ +static inline TCGCond tcg_invert_cond(TCGCond c) +{ + return (TCGCond)(c ^ 1); +} + +/* Swap the operands in a comparison. */ +static inline TCGCond tcg_swap_cond(TCGCond c) +{ + int mask = (c < TCG_COND_LT ? 0 : c < TCG_COND_LTU ? 7 : 15); + return (TCGCond)(c ^ mask); +} + +static inline TCGCond tcg_unsigned_cond(TCGCond c) +{ + return (c >= TCG_COND_LT && c <= TCG_COND_GT ? c + 4 : c); +} + +#define TEMP_VAL_DEAD 0 +#define TEMP_VAL_REG 1 +#define TEMP_VAL_MEM 2 +#define TEMP_VAL_CONST 3 + +/* XXX: optimize memory layout */ +typedef struct TCGTemp { + TCGType base_type; + TCGType type; + int val_type; + int reg; + tcg_target_long val; + int mem_reg; + tcg_target_long mem_offset; + unsigned int fixed_reg:1; + unsigned int mem_coherent:1; + unsigned int mem_allocated:1; + unsigned int temp_local:1; /* If true, the temp is saved accross + basic blocks. Otherwise, it is not + preserved accross basic blocks. */ + unsigned int temp_allocated:1; /* never used for code gen */ + /* index of next free temp of same base type, -1 if end */ + int next_free_temp; + const char *name; +} TCGTemp; + +typedef struct TCGHelperInfo { + tcg_target_ulong func; + const char *name; +} TCGHelperInfo; + +typedef struct TCGContext TCGContext; + +struct TCGContext { + uint8_t *pool_cur, *pool_end; + TCGPool *pool_first, *pool_current; + TCGLabel *labels; + int nb_labels; + TCGTemp *temps; /* globals first, temps after */ + int nb_globals; + int nb_temps; + /* index of free temps, -1 if none */ + int first_free_temp[TCG_TYPE_COUNT * 2]; + + /* goto_tb support */ + uint8_t *code_buf; + uintptr_t *tb_next; + uint16_t *tb_next_offset; + uint16_t *tb_jmp_offset; /* != NULL if USE_DIRECT_JUMP */ + + /* liveness analysis */ + uint16_t *op_dead_iargs; /* for each operation, each bit tells if the + corresponding input argument is dead */ + + /* tells in which temporary a given register is. 
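(-1 when the register is currently free.)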
It does not take + into account fixed registers */ + int reg_to_temp[TCG_TARGET_NB_REGS]; + TCGRegSet reserved_regs; + tcg_target_long current_frame_offset; + tcg_target_long frame_start; + tcg_target_long frame_end; + int frame_reg; + + uint8_t *code_ptr; + TCGTemp static_temps[TCG_MAX_TEMPS]; + + TCGHelperInfo *helpers; + int nb_helpers; + int allocated_helpers; + int helpers_sorted; + +#ifdef CONFIG_PROFILER + /* profiling info */ + int64_t tb_count1; + int64_t tb_count; + int64_t op_count; /* total insn count */ + int op_count_max; /* max insn per TB */ + int64_t temp_count; + int temp_count_max; + int64_t del_op_count; + int64_t code_in_len; + int64_t code_out_len; + int64_t interm_time; + int64_t code_time; + int64_t la_time; + int64_t restore_count; + int64_t restore_time; +#endif +}; + +extern TCGContext tcg_ctx; +extern uint16_t *gen_opc_ptr; +extern TCGArg *gen_opparam_ptr; +extern uint16_t gen_opc_buf[]; +extern TCGArg gen_opparam_buf[]; + +/* pool based memory allocation */ + +void *tcg_malloc_internal(TCGContext *s, int size); +void tcg_pool_reset(TCGContext *s); +void tcg_pool_delete(TCGContext *s); + +static inline void *tcg_malloc(int size) +{ + TCGContext *s = &tcg_ctx; + uint8_t *ptr, *ptr_end; + size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1); + ptr = s->pool_cur; + ptr_end = ptr + size; + if (unlikely(ptr_end > s->pool_end)) { + return tcg_malloc_internal(&tcg_ctx, size); + } else { + s->pool_cur = ptr_end; + return ptr; + } +} + +void tcg_context_init(TCGContext *s); +void tcg_prologue_init(TCGContext *s); +void tcg_func_start(TCGContext *s); + +int tcg_gen_code(TCGContext *s, uint8_t *gen_code_buf); +int tcg_gen_code_search_pc(TCGContext *s, uint8_t *gen_code_buf, intptr_t offset); + +void tcg_set_frame(TCGContext *s, int reg, + tcg_target_long start, tcg_target_long size); + +TCGv_i32 tcg_global_reg_new_i32(int reg, const char *name); +TCGv_i32 tcg_global_mem_new_i32(int reg, tcg_target_long offset, + const char *name); +TCGv_i32 tcg_temp_new_internal_i32(int temp_local); +static inline TCGv_i32 tcg_temp_new_i32(void) +{ + return tcg_temp_new_internal_i32(0); +} +static inline TCGv_i32 tcg_temp_local_new_i32(void) +{ + return tcg_temp_new_internal_i32(1); +} +void tcg_temp_free_i32(TCGv_i32 arg); +char *tcg_get_arg_str_i32(TCGContext *s, char *buf, int buf_size, TCGv_i32 arg); + +TCGv_i64 tcg_global_reg_new_i64(int reg, const char *name); +TCGv_i64 tcg_global_mem_new_i64(int reg, tcg_target_long offset, + const char *name); +TCGv_i64 tcg_temp_new_internal_i64(int temp_local); +static inline TCGv_i64 tcg_temp_new_i64(void) +{ + return tcg_temp_new_internal_i64(0); +} +static inline TCGv_i64 tcg_temp_local_new_i64(void) +{ + return tcg_temp_new_internal_i64(1); +} +void tcg_temp_free_i64(TCGv_i64 arg); +char *tcg_get_arg_str_i64(TCGContext *s, char *buf, int buf_size, TCGv_i64 arg); + +void tcg_dump_info(FILE *f, + int (*cpu_fprintf)(FILE *f, const char *fmt, ...)); + +#define TCG_CT_ALIAS 0x80 +#define TCG_CT_IALIAS 0x40 +#define TCG_CT_REG 0x01 +#define TCG_CT_CONST 0x02 /* any constant of register size */ + +typedef struct TCGArgConstraint { + uint16_t ct; + uint8_t alias_index; + union { + TCGRegSet regs; + } u; +} TCGArgConstraint; + +#define TCG_MAX_OP_ARGS 16 + +#define TCG_OPF_BB_END 0x01 /* instruction defines the end of a basic + block */ +#define TCG_OPF_CALL_CLOBBER 0x02 /* instruction clobbers call registers + and potentially update globals. 
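The register allocator spills any live globals and frees every call-clobbered register before emitting such an op (see tcg_reg_alloc_op).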
*/ +#define TCG_OPF_SIDE_EFFECTS 0x04 /* instruction has side effects : it + cannot be removed if its output + are not used */ + +typedef struct TCGOpDef { + const char *name; + uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args; + uint8_t flags; + TCGArgConstraint *args_ct; + int *sorted_args; +#if defined(CONFIG_DEBUG_TCG) + int used; +#endif +} TCGOpDef; + +typedef struct TCGTargetOpDef { + TCGOpcode op; + const char *args_ct_str[TCG_MAX_OP_ARGS]; +} TCGTargetOpDef; + +#ifndef VBOX +#define tcg_abort() \ +do {\ + fprintf(stderr, "%s:%d: tcg fatal error\n", __FILE__, __LINE__);\ + abort();\ +} while (0) +#else /* VBOX */ +# define tcg_abort() \ + do {\ + remAbort(-1, "TCG fatal error: "__FILE__":" RT_XSTR(__LINE__)); \ + } while (0) +extern void qemu_qsort(void* base, size_t nmemb, size_t size, + int(*compar)(const void*, const void*)); +#define tcg_exit(status) \ + do {\ + remAbort(-1, "TCG exit: "__FILE__":" RT_XSTR(__LINE__));\ + } while (0) +#endif /* VBOX */ + +void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs); + +#if TCG_TARGET_REG_BITS == 32 +#define tcg_const_ptr tcg_const_i32 +#define tcg_add_ptr tcg_add_i32 +#define tcg_sub_ptr tcg_sub_i32 +#define TCGv_ptr TCGv_i32 +#define GET_TCGV_PTR GET_TCGV_I32 +#define tcg_global_reg_new_ptr tcg_global_reg_new_i32 +#define tcg_global_mem_new_ptr tcg_global_mem_new_i32 +#define tcg_temp_new_ptr tcg_temp_new_i32 +#define tcg_temp_free_ptr tcg_temp_free_i32 +#else +#define tcg_const_ptr tcg_const_i64 +#define tcg_add_ptr tcg_add_i64 +#define tcg_sub_ptr tcg_sub_i64 +#define TCGv_ptr TCGv_i64 +#define GET_TCGV_PTR GET_TCGV_I64 +#define tcg_global_reg_new_ptr tcg_global_reg_new_i64 +#define tcg_global_mem_new_ptr tcg_global_mem_new_i64 +#define tcg_temp_new_ptr tcg_temp_new_i64 +#define tcg_temp_free_ptr tcg_temp_free_i64 +#endif + +void tcg_gen_callN(TCGContext *s, TCGv_ptr func, unsigned int flags, + int sizemask, TCGArg ret, int nargs, TCGArg *args); + +void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1, + int c, int right, int arith); + +/* only used for debugging purposes */ +void tcg_register_helper(void *func, const char *name); +const char *tcg_helper_get_name(TCGContext *s, void *func); +void tcg_dump_ops(TCGContext *s, FILE *outfile); + +void dump_ops(const uint16_t *opc_buf, const TCGArg *opparam_buf); +TCGv_i32 tcg_const_i32(int32_t val); +TCGv_i64 tcg_const_i64(int64_t val); +TCGv_i32 tcg_const_local_i32(int32_t val); +TCGv_i64 tcg_const_local_i64(int64_t val); + +#ifndef VBOX +extern uint8_t code_gen_prologue[]; +#else +extern uint8_t *code_gen_prologue; +#endif +#if defined(_ARCH_PPC) && !defined(_ARCH_PPC64) +#define tcg_qemu_tb_exec(tb_ptr) \ + ((intptr_t REGPARM __attribute__ ((longcall)) (*)(void *))code_gen_prologue)(tb_ptr) +#else +# if defined(VBOX) && defined(GCC_WITH_BUGGY_REGPARM) && !defined(__MINGW64__) +# define tcg_qemu_tb_exec(tb_ptr, ret) \ + __asm__ __volatile__("call *%%ecx" : "=a"(ret) : "a"(tb_ptr), "c" (&code_gen_prologue[0]) : "memory", "%edx", "cc") +# else +#define tcg_qemu_tb_exec(tb_ptr) ((intptr_t REGPARM (*)(void *))code_gen_prologue)(tb_ptr) +# endif +#endif diff --git a/src/recompiler/tests/Makefile b/src/recompiler/tests/Makefile new file mode 100644 index 00000000..ff7f787a --- /dev/null +++ b/src/recompiler/tests/Makefile @@ -0,0 +1,109 @@ +-include ../config-host.mak + +$(call set-vpath, $(SRC_PATH)/tests) + +CFLAGS=-Wall -O2 -g -fno-strict-aliasing +#CFLAGS+=-msse2 +LDFLAGS= + +ifeq ($(ARCH),i386) +TESTS=linux-test testthread sha1-i386 test-i386 +endif +ifeq ($(ARCH),x86_64) 
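+# test-x86_64 builds the same test-i386.c source with -m64 (see rule below)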
+TESTS=test-x86_64 +endif +TESTS+=sha1# test_path +#TESTS+=test_path +#TESTS+=runcom + +QEMU=../i386-linux-user/qemu-i386 + +all: $(TESTS) + +hello-i386: hello-i386.c + $(CC) -nostdlib $(CFLAGS) -static $(LDFLAGS) -o $@ $< + strip $@ + +testthread: testthread.c + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< -lpthread + +test_path: test_path.c + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< + ./$@ || { rm $@; exit 1; } + +# i386/x86_64 emulation test (test various opcodes) */ +test-i386: test-i386.c test-i386-code16.S test-i386-vm86.S \ + test-i386.h test-i386-shift.h test-i386-muldiv.h + $(CC) -m32 $(CFLAGS) $(LDFLAGS) -static -o $@ \ + $(<D)/test-i386.c $(<D)/test-i386-code16.S $(<D)/test-i386-vm86.S -lm + +test-x86_64: test-i386.c \ + test-i386.h test-i386-shift.h test-i386-muldiv.h + $(CC) -m64 $(CFLAGS) $(LDFLAGS) -static -o $@ $(<D)/test-i386.c -lm + +ifeq ($(ARCH),i386) +test: test-i386 + ./test-i386 > test-i386.ref +else +test: +endif + $(QEMU) test-i386 > test-i386.out + @if diff -u test-i386.ref test-i386.out ; then echo "Auto Test OK"; fi + +.PHONY: test-mmap +test-mmap: test-mmap.c + $(CC) $(CFLAGS) -Wall -static -O2 $(LDFLAGS) -o $@ $< + -./test-mmap + -$(QEMU) ./test-mmap + -$(QEMU) -p 8192 ./test-mmap 8192 + -$(QEMU) -p 16384 ./test-mmap 16384 + -$(QEMU) -p 32768 ./test-mmap 32768 + +# generic Linux and CPU test +linux-test: linux-test.c + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< -lm + +# speed test +sha1-i386: sha1.c + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< + +sha1: sha1.c + $(HOST_CC) $(CFLAGS) $(LDFLAGS) -o $@ $< + +speed: sha1 sha1-i386 + time ./sha1 + time $(QEMU) ./sha1-i386 + +# vm86 test +runcom: runcom.c + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< + +# NOTE: -fomit-frame-pointer is currently needed : this is a bug in libqemu +qruncom: qruncom.c ../ioport-user.c ../i386-user/libqemu.a + $(CC) $(CFLAGS) -fomit-frame-pointer $(LDFLAGS) -I../target-i386 -I.. -I../i386-user -I../fpu \ + -o $@ $(filter %.c, $^) -L../i386-user -lqemu -lm + +# arm test +hello-arm: hello-arm.o + arm-linux-ld -o $@ $< + +hello-arm.o: hello-arm.c + arm-linux-gcc -Wall -g -O2 -c -o $@ $< + +test-arm-iwmmxt: test-arm-iwmmxt.s + cpp < $< | arm-linux-gnu-gcc -Wall -static -march=iwmmxt -mabi=aapcs -x assembler - -o $@ + +# MIPS test +hello-mips: hello-mips.c + mips-linux-gnu-gcc -nostdlib -static -mno-abicalls -fno-PIC -mabi=32 -Wall -Wextra -g -O2 -o $@ $< + +hello-mipsel: hello-mips.c + mipsel-linux-gnu-gcc -nostdlib -static -mno-abicalls -fno-PIC -mabi=32 -Wall -Wextra -g -O2 -o $@ $< + +# testsuite for the CRIS port. 
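+# (runs the check target of the cris/ sub-makefile)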
+test-cris: + $(MAKE) -C cris check + +clean: + rm -f *~ *.o test-i386.out test-i386.ref \ + test-x86_64.log test-x86_64.ref qruncom $(TESTS) diff --git a/src/recompiler/tests/hello-arm.c b/src/recompiler/tests/hello-arm.c new file mode 100644 index 00000000..e0daa7ad --- /dev/null +++ b/src/recompiler/tests/hello-arm.c @@ -0,0 +1,113 @@ +#define __NR_SYSCALL_BASE 0x900000 +#define __NR_exit1 (__NR_SYSCALL_BASE+ 1) +#define __NR_write (__NR_SYSCALL_BASE+ 4) + +#define __sys2(x) #x +#define __sys1(x) __sys2(x) + +#ifndef __syscall +#define __syscall(name) "swi\t" __sys1(__NR_##name) "\n\t" +#endif + +#define __syscall_return(type, res) \ +do { \ + return (type) (res); \ +} while (0) + +#define _syscall0(type,name) \ +type name(void) { \ + long __res; \ + __asm__ __volatile__ ( \ + __syscall(name) \ + "mov %0,r0" \ + :"=r" (__res) : : "r0","lr"); \ + __syscall_return(type,__res); \ +} + +#define _syscall1(type,name,type1,arg1) \ +type name(type1 arg1) { \ + long __res; \ + __asm__ __volatile__ ( \ + "mov\tr0,%1\n\t" \ + __syscall(name) \ + "mov %0,r0" \ + : "=r" (__res) \ + : "r" ((long)(arg1)) \ + : "r0","lr"); \ + __syscall_return(type,__res); \ +} + +#define _syscall2(type,name,type1,arg1,type2,arg2) \ +type name(type1 arg1,type2 arg2) { \ + long __res; \ + __asm__ __volatile__ ( \ + "mov\tr0,%1\n\t" \ + "mov\tr1,%2\n\t" \ + __syscall(name) \ + "mov\t%0,r0" \ + : "=r" (__res) \ + : "r" ((long)(arg1)),"r" ((long)(arg2)) \ + : "r0","r1","lr"); \ + __syscall_return(type,__res); \ +} + + +#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \ +type name(type1 arg1,type2 arg2,type3 arg3) { \ + long __res; \ + __asm__ __volatile__ ( \ + "mov\tr0,%1\n\t" \ + "mov\tr1,%2\n\t" \ + "mov\tr2,%3\n\t" \ + __syscall(name) \ + "mov\t%0,r0" \ + : "=r" (__res) \ + : "r" ((long)(arg1)),"r" ((long)(arg2)),"r" ((long)(arg3)) \ + : "r0","r1","r2","lr"); \ + __syscall_return(type,__res); \ +} + + +#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ +type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + long __res; \ + __asm__ __volatile__ ( \ + "mov\tr0,%1\n\t" \ + "mov\tr1,%2\n\t" \ + "mov\tr2,%3\n\t" \ + "mov\tr3,%4\n\t" \ + __syscall(name) \ + "mov\t%0,r0" \ + : "=r" (__res) \ + : "r" ((long)(arg1)),"r" ((long)(arg2)),"r" ((long)(arg3)),"r" ((long)(arg4)) \ + : "r0","r1","r2","r3","lr"); \ + __syscall_return(type,__res); \ +} + + +#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \ +type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) { \ + long __res; \ + __asm__ __volatile__ ( \ + "mov\tr0,%1\n\t" \ + "mov\tr1,%2\n\t" \ + "mov\tr2,%3\n\t" \ + "mov\tr3,%4\n\t" \ + "mov\tr4,%5\n\t" \ + __syscall(name) \ + "mov\t%0,r0" \ + : "=r" (__res) \ + : "r" ((long)(arg1)),"r" ((long)(arg2)),"r" ((long)(arg3)),"r" ((long)(arg4)), \ + "r" ((long)(arg5)) \ + : "r0","r1","r2","r3","r4","lr"); \ + __syscall_return(type,__res); \ +} + +_syscall1(int,exit1,int,status); +_syscall3(int,write,int,fd,const char *,buf, int, len); + +void _start(void) +{ + write(1, "Hello World\n", 12); + exit1(0); +} diff --git a/src/recompiler/tests/hello-i386.c b/src/recompiler/tests/hello-i386.c new file mode 100644 index 00000000..e00245d3 --- /dev/null +++ b/src/recompiler/tests/hello-i386.c @@ -0,0 +1,26 @@ +#include <asm/unistd.h> + +extern inline volatile void exit(int status) +{ + int __res; + __asm__ volatile ("movl %%ecx,%%ebx\n"\ + "int $0x80" \ + : "=a" (__res) : "0" (__NR_exit),"c" ((long)(status))); +} + +extern inline int write(int fd, 
const char * buf, int len) +{ + int status; + __asm__ volatile ("pushl %%ebx\n"\ + "movl %%esi,%%ebx\n"\ + "int $0x80\n" \ + "popl %%ebx\n"\ + : "=a" (status) \ + : "0" (__NR_write),"S" ((long)(fd)),"c" ((long)(buf)),"d" ((long)(len))); +} + +void _start(void) +{ + write(1, "Hello World\n", 12); + exit(0); +} diff --git a/src/recompiler/tests/linux-test.c b/src/recompiler/tests/linux-test.c new file mode 100644 index 00000000..cce80373 --- /dev/null +++ b/src/recompiler/tests/linux-test.c @@ -0,0 +1,545 @@ +/* + * linux and CPU test + * + * Copyright (c) 2003 Fabrice Bellard + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle GPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the General Public License version 2 (GPLv2) at this time for any software where + * a choice of GPL license versions is made available with the language indicating + * that GPLv2 or any later version may be used, or where a choice of which version + * of the GPL is applied is otherwise unspecified. + */ + +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <inttypes.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <errno.h> +#include <utime.h> +#include <time.h> +#include <sys/time.h> +#include <sys/uio.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <sched.h> +#include <dirent.h> +#include <setjmp.h> +#include <sys/shm.h> + +#define TESTPATH "/tmp/linux-test.tmp" +#define TESTPORT 7654 +#define STACK_SIZE 16384 + +void error1(const char *filename, int line, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + fprintf(stderr, "%s:%d: ", filename, line); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +int __chk_error(const char *filename, int line, int ret) +{ + if (ret < 0) { + error1(filename, line, "%m (ret=%d, errno=%d)", + ret, errno); + } + return ret; +} + +#define error(fmt, ...) 
error1(__FILE__, __LINE__, fmt, ## __VA_ARGS__) + +#define chk_error(ret) __chk_error(__FILE__, __LINE__, (ret)) + +/*******************************************************/ + +#define FILE_BUF_SIZE 300 + +void test_file(void) +{ + int fd, i, len, ret; + uint8_t buf[FILE_BUF_SIZE]; + uint8_t buf2[FILE_BUF_SIZE]; + uint8_t buf3[FILE_BUF_SIZE]; + char cur_dir[1024]; + struct stat st; + struct utimbuf tbuf; + struct iovec vecs[2]; + DIR *dir; + struct dirent *de; + + /* clean up, just in case */ + unlink(TESTPATH "/file1"); + unlink(TESTPATH "/file2"); + unlink(TESTPATH "/file3"); + rmdir(TESTPATH); + + if (getcwd(cur_dir, sizeof(cur_dir)) == NULL) + error("getcwd"); + + chk_error(mkdir(TESTPATH, 0755)); + + chk_error(chdir(TESTPATH)); + + /* open/read/write/close/readv/writev/lseek */ + + fd = chk_error(open("file1", O_WRONLY | O_TRUNC | O_CREAT, 0644)); + for(i=0;i < FILE_BUF_SIZE; i++) + buf[i] = i; + len = chk_error(write(fd, buf, FILE_BUF_SIZE / 2)); + if (len != (FILE_BUF_SIZE / 2)) + error("write"); + vecs[0].iov_base = buf + (FILE_BUF_SIZE / 2); + vecs[0].iov_len = 16; + vecs[1].iov_base = buf + (FILE_BUF_SIZE / 2) + 16; + vecs[1].iov_len = (FILE_BUF_SIZE / 2) - 16; + len = chk_error(writev(fd, vecs, 2)); + if (len != (FILE_BUF_SIZE / 2)) + error("writev"); + chk_error(close(fd)); + + chk_error(rename("file1", "file2")); + + fd = chk_error(open("file2", O_RDONLY)); + + len = chk_error(read(fd, buf2, FILE_BUF_SIZE)); + if (len != FILE_BUF_SIZE) + error("read"); + if (memcmp(buf, buf2, FILE_BUF_SIZE) != 0) + error("memcmp"); + +#define FOFFSET 16 + ret = chk_error(lseek(fd, FOFFSET, SEEK_SET)); + if (ret != 16) + error("lseek"); + vecs[0].iov_base = buf3; + vecs[0].iov_len = 32; + vecs[1].iov_base = buf3 + 32; + vecs[1].iov_len = FILE_BUF_SIZE - FOFFSET - 32; + len = chk_error(readv(fd, vecs, 2)); + if (len != FILE_BUF_SIZE - FOFFSET) + error("readv"); + if (memcmp(buf + FOFFSET, buf3, FILE_BUF_SIZE - FOFFSET) != 0) + error("memcmp"); + + chk_error(close(fd)); + + /* access */ + chk_error(access("file2", R_OK)); + + /* stat/chmod/utime/truncate */ + + chk_error(chmod("file2", 0600)); + tbuf.actime = 1001; + tbuf.modtime = 1000; + chk_error(truncate("file2", 100)); + chk_error(utime("file2", &tbuf)); + chk_error(stat("file2", &st)); + if (st.st_size != 100) + error("stat size"); + if (!S_ISREG(st.st_mode)) + error("stat mode"); + if ((st.st_mode & 0777) != 0600) + error("stat mode2"); + if (st.st_atime != 1001 || + st.st_mtime != 1000) + error("stat time"); + + chk_error(stat(TESTPATH, &st)); + if (!S_ISDIR(st.st_mode)) + error("stat mode"); + + /* fstat */ + fd = chk_error(open("file2", O_RDWR)); + chk_error(ftruncate(fd, 50)); + chk_error(fstat(fd, &st)); + chk_error(close(fd)); + + if (st.st_size != 50) + error("stat size"); + if (!S_ISREG(st.st_mode)) + error("stat mode"); + + /* symlink/lstat */ + chk_error(symlink("file2", "file3")); + chk_error(lstat("file3", &st)); + if (!S_ISLNK(st.st_mode)) + error("stat mode"); + + /* getdents */ + dir = opendir(TESTPATH); + if (!dir) + error("opendir"); + len = 0; + for(;;) { + de = readdir(dir); + if (!de) + break; + if (strcmp(de->d_name, ".") != 0 && + strcmp(de->d_name, "..") != 0 && + strcmp(de->d_name, "file2") != 0 && + strcmp(de->d_name, "file3") != 0) + error("readdir"); + len++; + } + closedir(dir); + if (len != 4) + error("readdir"); + + chk_error(unlink("file3")); + chk_error(unlink("file2")); + chk_error(chdir(cur_dir)); + chk_error(rmdir(TESTPATH)); +} + +void test_fork(void) +{ + int pid, status; + + pid = chk_error(fork()); + 
if (pid == 0) { + /* child */ + exit(2); + } + chk_error(waitpid(pid, &status, 0)); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 2) + error("waitpid status=0x%x", status); +} + +void test_time(void) +{ + struct timeval tv, tv2; + struct timespec ts, rem; + struct rusage rusg1, rusg2; + int ti, i; + + chk_error(gettimeofday(&tv, NULL)); + rem.tv_sec = 1; + ts.tv_sec = 0; + ts.tv_nsec = 20 * 1000000; + chk_error(nanosleep(&ts, &rem)); + if (rem.tv_sec != 1) + error("nanosleep"); + chk_error(gettimeofday(&tv2, NULL)); + ti = tv2.tv_sec - tv.tv_sec; + if (ti >= 2) + error("gettimeofday"); + + chk_error(getrusage(RUSAGE_SELF, &rusg1)); + for(i = 0;i < 10000; i++); + chk_error(getrusage(RUSAGE_SELF, &rusg2)); + if ((rusg2.ru_utime.tv_sec - rusg1.ru_utime.tv_sec) < 0 || + (rusg2.ru_stime.tv_sec - rusg1.ru_stime.tv_sec) < 0) + error("getrusage"); +} + +void pstrcpy(char *buf, int buf_size, const char *str) +{ + int c; + char *q = buf; + + if (buf_size <= 0) + return; + + for(;;) { + c = *str++; + if (c == 0 || q >= buf + buf_size - 1) + break; + *q++ = c; + } + *q = '\0'; +} + +/* strcat and truncate. */ +char *pstrcat(char *buf, int buf_size, const char *s) +{ + int len; + len = strlen(buf); + if (len < buf_size) + pstrcpy(buf + len, buf_size - len, s); + return buf; +} + +int server_socket(void) +{ + int val, fd; + struct sockaddr_in sockaddr; + + /* server socket */ + fd = chk_error(socket(PF_INET, SOCK_STREAM, 0)); + + val = 1; + chk_error(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val))); + + sockaddr.sin_family = AF_INET; + sockaddr.sin_port = htons(TESTPORT); + sockaddr.sin_addr.s_addr = 0; + chk_error(bind(fd, (struct sockaddr *)&sockaddr, sizeof(sockaddr))); + chk_error(listen(fd, 0)); + return fd; + +} + +int client_socket(void) +{ + int fd; + struct sockaddr_in sockaddr; + + /* server socket */ + fd = chk_error(socket(PF_INET, SOCK_STREAM, 0)); + sockaddr.sin_family = AF_INET; + sockaddr.sin_port = htons(TESTPORT); + inet_aton("127.0.0.1", &sockaddr.sin_addr); + chk_error(connect(fd, (struct sockaddr *)&sockaddr, sizeof(sockaddr))); + return fd; +} + +const char socket_msg[] = "hello socket\n"; + +void test_socket(void) +{ + int server_fd, client_fd, fd, pid, ret, val; + struct sockaddr_in sockaddr; + socklen_t len; + char buf[512]; + + server_fd = server_socket(); + + /* test a few socket options */ + len = sizeof(val); + chk_error(getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &val, &len)); + if (val != SOCK_STREAM) + error("getsockopt"); + + pid = chk_error(fork()); + if (pid == 0) { + client_fd = client_socket(); + send(client_fd, socket_msg, sizeof(socket_msg), 0); + close(client_fd); + exit(0); + } + len = sizeof(sockaddr); + fd = chk_error(accept(server_fd, (struct sockaddr *)&sockaddr, &len)); + + ret = chk_error(recv(fd, buf, sizeof(buf), 0)); + if (ret != sizeof(socket_msg)) + error("recv"); + if (memcmp(buf, socket_msg, sizeof(socket_msg)) != 0) + error("socket_msg"); + chk_error(close(fd)); + chk_error(close(server_fd)); +} + +#define WCOUNT_MAX 512 + +void test_pipe(void) +{ + fd_set rfds, wfds; + int fds[2], fd_max, ret; + uint8_t ch; + int wcount, rcount; + + chk_error(pipe(fds)); + chk_error(fcntl(fds[0], F_SETFL, O_NONBLOCK)); + chk_error(fcntl(fds[1], F_SETFL, O_NONBLOCK)); + wcount = 0; + rcount = 0; + for(;;) { + FD_ZERO(&rfds); + fd_max = fds[0]; + FD_SET(fds[0], &rfds); + + FD_ZERO(&wfds); + FD_SET(fds[1], &wfds); + if (fds[1] > fd_max) + fd_max = fds[1]; + + ret = chk_error(select(fd_max + 1, &rfds, &wfds, NULL, NULL)); + if (ret > 0) { + if 
(FD_ISSET(fds[0], &rfds)) { + chk_error(read(fds[0], &ch, 1)); + rcount++; + if (rcount >= WCOUNT_MAX) + break; + } + if (FD_ISSET(fds[1], &wfds)) { + ch = 'a'; + chk_error(write(fds[0], &ch, 1)); + wcount++; + } + } + } + chk_error(close(fds[0])); + chk_error(close(fds[1])); +} + +int thread1_res; +int thread2_res; + +int thread1_func(void *arg) +{ + int i; + for(i=0;i<5;i++) { + thread1_res++; + usleep(10 * 1000); + } + return 0; +} + +int thread2_func(void *arg) +{ + int i; + for(i=0;i<6;i++) { + thread2_res++; + usleep(10 * 1000); + } + return 0; +} + +void test_clone(void) +{ + uint8_t *stack1, *stack2; + int pid1, pid2, status1, status2; + + stack1 = malloc(STACK_SIZE); + pid1 = chk_error(clone(thread1_func, stack1 + STACK_SIZE, + CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, "hello1")); + + stack2 = malloc(STACK_SIZE); + pid2 = chk_error(clone(thread2_func, stack2 + STACK_SIZE, + CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, "hello2")); + + while (waitpid(pid1, &status1, 0) != pid1); + while (waitpid(pid2, &status2, 0) != pid2); + if (thread1_res != 5 || + thread2_res != 6) + error("clone"); +} + +/***********************************/ + +volatile int alarm_count; +jmp_buf jmp_env; + +void sig_alarm(int sig) +{ + if (sig != SIGALRM) + error("signal"); + alarm_count++; +} + +void sig_segv(int sig, siginfo_t *info, void *puc) +{ + if (sig != SIGSEGV) + error("signal"); + longjmp(jmp_env, 1); +} + +void test_signal(void) +{ + struct sigaction act; + struct itimerval it, oit; + + /* timer test */ + + alarm_count = 0; + + act.sa_handler = sig_alarm; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + chk_error(sigaction(SIGALRM, &act, NULL)); + + it.it_interval.tv_sec = 0; + it.it_interval.tv_usec = 10 * 1000; + it.it_value.tv_sec = 0; + it.it_value.tv_usec = 10 * 1000; + chk_error(setitimer(ITIMER_REAL, &it, NULL)); + chk_error(getitimer(ITIMER_REAL, &oit)); + if (oit.it_value.tv_sec != it.it_value.tv_sec || + oit.it_value.tv_usec != it.it_value.tv_usec) + error("itimer"); + + while (alarm_count < 5) { + usleep(10 * 1000); + } + + it.it_interval.tv_sec = 0; + it.it_interval.tv_usec = 0; + it.it_value.tv_sec = 0; + it.it_value.tv_usec = 0; + memset(&oit, 0xff, sizeof(oit)); + chk_error(setitimer(ITIMER_REAL, &it, &oit)); + if (oit.it_value.tv_sec != 0 || + oit.it_value.tv_usec != 10 * 1000) + error("setitimer"); + + /* SIGSEGV test */ + act.sa_sigaction = sig_segv; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_SIGINFO; + chk_error(sigaction(SIGSEGV, &act, NULL)); + if (setjmp(jmp_env) == 0) { + *(uint8_t *)0 = 0; + } + + act.sa_handler = SIG_DFL; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + chk_error(sigaction(SIGSEGV, &act, NULL)); +} + +#define SHM_SIZE 32768 + +void test_shm(void) +{ + void *ptr; + int shmid; + + shmid = chk_error(shmget(IPC_PRIVATE, SHM_SIZE, IPC_CREAT | 0777)); + ptr = shmat(shmid, NULL, 0); + if (!ptr) + error("shmat"); + + memset(ptr, 0, SHM_SIZE); + + chk_error(shmctl(shmid, IPC_RMID, 0)); + chk_error(shmdt(ptr)); +} + +int main(int argc, char **argv) +{ + test_file(); + test_fork(); + test_time(); + test_socket(); + // test_clone(); + test_signal(); + test_shm(); + return 0; +} diff --git a/src/recompiler/tests/pi_10.com b/src/recompiler/tests/pi_10.com Binary files differnew file mode 100644 index 00000000..8993ba1a --- /dev/null +++ b/src/recompiler/tests/pi_10.com diff --git a/src/recompiler/tests/qruncom.c b/src/recompiler/tests/qruncom.c new file mode 100644 index 00000000..079f7a29 --- /dev/null +++ b/src/recompiler/tests/qruncom.c @@ -0,0 +1,284 @@ 
+/* + * Example of use of user mode libqemu: launch a basic .com DOS + * executable + */ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <signal.h> +#include <malloc.h> + +#include "cpu.h" + +//#define SIGTEST + +int cpu_get_pic_interrupt(CPUState *env) +{ + return -1; +} + +uint64_t cpu_get_tsc(CPUState *env) +{ + return 0; +} + +static void set_gate(void *ptr, unsigned int type, unsigned int dpl, + unsigned long addr, unsigned int sel) +{ + unsigned int e1, e2; + e1 = (addr & 0xffff) | (sel << 16); + e2 = (addr & 0xffff0000) | 0x8000 | (dpl << 13) | (type << 8); + stl((uint8_t *)ptr, e1); + stl((uint8_t *)ptr + 4, e2); +} + +uint64_t idt_table[256]; + +/* only dpl matters as we do only user space emulation */ +static void set_idt(int n, unsigned int dpl) +{ + set_gate(idt_table + n, 0, dpl, 0, 0); +} + +void qemu_free(void *ptr) +{ + free(ptr); +} + +void *qemu_malloc(size_t size) +{ + return malloc(size); +} + +void *qemu_mallocz(size_t size) +{ + void *ptr; + ptr = qemu_malloc(size); + if (!ptr) + return NULL; + memset(ptr, 0, size); + return ptr; +} + +void *qemu_vmalloc(size_t size) +{ + return memalign(4096, size); +} + +void qemu_vfree(void *ptr) +{ + free(ptr); +} + +void qemu_printf(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + +/* XXX: this is a bug in helper2.c */ +int errno; + +/**********************************************/ + +#define COM_BASE_ADDR 0x10100 + +static void usage(void) +{ + printf("qruncom version 0.1 (c) 2003 Fabrice Bellard\n" + "usage: qruncom file.com\n" + "user mode libqemu demo: run simple .com DOS executables\n"); + exit(1); +} + +static inline uint8_t *seg_to_linear(unsigned int seg, unsigned int reg) +{ + return (uint8_t *)((seg << 4) + (reg & 0xffff)); +} + +static inline void pushw(CPUState *env, int val) +{ + env->regs[R_ESP] = (env->regs[R_ESP] & ~0xffff) | ((env->regs[R_ESP] - 2) & 0xffff); + *(uint16_t *)seg_to_linear(env->segs[R_SS].selector, env->regs[R_ESP]) = val; +} + +static void host_segv_handler(int host_signum, siginfo_t *info, + void *puc) +{ + if (cpu_signal_handler(host_signum, info, puc)) { + return; + } + abort(); +} + +int main(int argc, char **argv) +{ + uint8_t *vm86_mem; + const char *filename; + int fd, ret, seg; + CPUState *env; + + if (argc != 2) + usage(); + filename = argv[1]; + + vm86_mem = mmap((void *)0x00000000, 0x110000, + PROT_WRITE | PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + if (vm86_mem == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* load the MSDOS .com executable */ + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + exit(1); + } + ret = read(fd, vm86_mem + COM_BASE_ADDR, 65536 - 256); + if (ret < 0) { + perror("read"); + exit(1); + } + close(fd); + + /* install exception handler for CPU emulator */ + { + struct sigaction act; + + sigfillset(&act.sa_mask); + act.sa_flags = SA_SIGINFO; + // act.sa_flags |= SA_ONSTACK; + + act.sa_sigaction = host_segv_handler; + sigaction(SIGSEGV, &act, NULL); + sigaction(SIGBUS, &act, NULL); + } + + // cpu_set_log(CPU_LOG_TB_IN_ASM | CPU_LOG_TB_OUT_ASM | CPU_LOG_EXEC); + + env = cpu_init("qemu32"); + + cpu_x86_set_cpl(env, 3); + + env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK; + /* NOTE: hflags duplicates some of the virtual CPU state */ + env->hflags |= HF_PE_MASK | VM_MASK; + + /* flags setup : we activate the IRQs by default as in user + mode. 
We also activate the VM86 flag to run DOS code */ + env->eflags |= IF_MASK | VM_MASK; + + /* init basic registers */ + env->eip = 0x100; + env->regs[R_ESP] = 0xfffe; + seg = (COM_BASE_ADDR - 0x100) >> 4; + + cpu_x86_load_seg_cache(env, R_CS, seg, + (seg << 4), 0xffff, 0); + cpu_x86_load_seg_cache(env, R_SS, seg, + (seg << 4), 0xffff, 0); + cpu_x86_load_seg_cache(env, R_DS, seg, + (seg << 4), 0xffff, 0); + cpu_x86_load_seg_cache(env, R_ES, seg, + (seg << 4), 0xffff, 0); + cpu_x86_load_seg_cache(env, R_FS, seg, + (seg << 4), 0xffff, 0); + cpu_x86_load_seg_cache(env, R_GS, seg, + (seg << 4), 0xffff, 0); + + /* exception support */ + env->idt.base = (unsigned long)idt_table; + env->idt.limit = sizeof(idt_table) - 1; + set_idt(0, 0); + set_idt(1, 0); + set_idt(2, 0); + set_idt(3, 3); + set_idt(4, 3); + set_idt(5, 3); + set_idt(6, 0); + set_idt(7, 0); + set_idt(8, 0); + set_idt(9, 0); + set_idt(10, 0); + set_idt(11, 0); + set_idt(12, 0); + set_idt(13, 0); + set_idt(14, 0); + set_idt(15, 0); + set_idt(16, 0); + set_idt(17, 0); + set_idt(18, 0); + set_idt(19, 0); + + /* put return code */ + *seg_to_linear(env->segs[R_CS].selector, 0) = 0xb4; /* mov ah, $0 */ + *seg_to_linear(env->segs[R_CS].selector, 1) = 0x00; + *seg_to_linear(env->segs[R_CS].selector, 2) = 0xcd; /* int $0x21 */ + *seg_to_linear(env->segs[R_CS].selector, 3) = 0x21; + pushw(env, 0x0000); + + /* the value of these registers seem to be assumed by pi_10.com */ + env->regs[R_ESI] = 0x100; + env->regs[R_ECX] = 0xff; + env->regs[R_EBP] = 0x0900; + env->regs[R_EDI] = 0xfffe; + + /* inform the emulator of the mmaped memory */ + page_set_flags(0x00000000, 0x110000, + PAGE_WRITE | PAGE_READ | PAGE_EXEC | PAGE_VALID); + + for(;;) { + ret = cpu_x86_exec(env); + switch(ret) { + case EXCP0D_GPF: + { + int int_num, ah; + int_num = *(uint8_t *)(env->segs[R_CS].base + env->eip + 1); + if (int_num != 0x21) + goto unknown_int; + ah = (env->regs[R_EAX] >> 8) & 0xff; + switch(ah) { + case 0x00: /* exit */ + exit(0); + case 0x02: /* write char */ + { + uint8_t c = env->regs[R_EDX]; + write(1, &c, 1); + } + break; + case 0x09: /* write string */ + { + uint8_t c; + for(;;) { + c = *seg_to_linear(env->segs[R_DS].selector, env->regs[R_EAX]); + if (c == '$') + break; + write(1, &c, 1); + } + env->regs[R_EAX] = (env->regs[R_EAX] & ~0xff) | '$'; + } + break; + default: + unknown_int: + fprintf(stderr, "unsupported int 0x%02x\n", int_num); + cpu_dump_state(env, stderr, fprintf, 0); + // exit(1); + } + env->eip += 2; + } + break; + default: + fprintf(stderr, "unhandled cpu_exec return code (0x%x)\n", ret); + cpu_dump_state(env, stderr, fprintf, 0); + exit(1); + } + } +} diff --git a/src/recompiler/tests/runcom.c b/src/recompiler/tests/runcom.c new file mode 100644 index 00000000..63805666 --- /dev/null +++ b/src/recompiler/tests/runcom.c @@ -0,0 +1,195 @@ +/* + * Simple example of use of vm86: launch a basic .com DOS executable + */ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <signal.h> + +#include <linux/unistd.h> +#include <asm/vm86.h> + +//#define SIGTEST + +#undef __syscall_return +#define __syscall_return(type, res) \ +do { \ + return (type) (res); \ +} while (0) + +_syscall2(int, vm86, int, func, struct vm86plus_struct *, v86) + +#define COM_BASE_ADDR 0x10100 + +static void usage(void) +{ + printf("runcom version 0.1 (c) 2003 Fabrice Bellard\n" + "usage: runcom file.com\n" + "VM86 Run simple .com DOS executables (linux vm86 test mode)\n"); + 
exit(1); +} + +static inline void set_bit(uint8_t *a, unsigned int bit) +{ + a[bit / 8] |= (1 << (bit % 8)); +} + +static inline uint8_t *seg_to_linear(unsigned int seg, unsigned int reg) +{ + return (uint8_t *)((seg << 4) + (reg & 0xffff)); +} + +static inline void pushw(struct vm86_regs *r, int val) +{ + r->esp = (r->esp & ~0xffff) | ((r->esp - 2) & 0xffff); + *(uint16_t *)seg_to_linear(r->ss, r->esp) = val; +} + +void dump_regs(struct vm86_regs *r) +{ + fprintf(stderr, + "EAX=%08lx EBX=%08lx ECX=%08lx EDX=%08lx\n" + "ESI=%08lx EDI=%08lx EBP=%08lx ESP=%08lx\n" + "EIP=%08lx EFL=%08lx\n" + "CS=%04x DS=%04x ES=%04x SS=%04x FS=%04x GS=%04x\n", + r->eax, r->ebx, r->ecx, r->edx, r->esi, r->edi, r->ebp, r->esp, + r->eip, r->eflags, + r->cs, r->ds, r->es, r->ss, r->fs, r->gs); +} + +#ifdef SIGTEST +void alarm_handler(int sig) +{ + fprintf(stderr, "alarm signal=%d\n", sig); + alarm(1); +} +#endif + +int main(int argc, char **argv) +{ + uint8_t *vm86_mem; + const char *filename; + int fd, ret, seg; + struct vm86plus_struct ctx; + struct vm86_regs *r; + + if (argc != 2) + usage(); + filename = argv[1]; + + vm86_mem = mmap((void *)0x00000000, 0x110000, + PROT_WRITE | PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + if (vm86_mem == MAP_FAILED) { + perror("mmap"); + exit(1); + } +#ifdef SIGTEST + { + struct sigaction act; + + act.sa_handler = alarm_handler; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + sigaction(SIGALRM, &act, NULL); + alarm(1); + } +#endif + + /* load the MSDOS .com executable */ + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + exit(1); + } + ret = read(fd, vm86_mem + COM_BASE_ADDR, 65536 - 256); + if (ret < 0) { + perror("read"); + exit(1); + } + close(fd); + + memset(&ctx, 0, sizeof(ctx)); + /* init basic registers */ + r = &ctx.regs; + r->eip = 0x100; + r->esp = 0xfffe; + seg = (COM_BASE_ADDR - 0x100) >> 4; + r->cs = seg; + r->ss = seg; + r->ds = seg; + r->es = seg; + r->fs = seg; + r->gs = seg; + r->eflags = VIF_MASK; + + /* put return code */ + set_bit((uint8_t *)&ctx.int_revectored, 0x21); + *seg_to_linear(r->cs, 0) = 0xb4; /* mov ah, $0 */ + *seg_to_linear(r->cs, 1) = 0x00; + *seg_to_linear(r->cs, 2) = 0xcd; /* int $0x21 */ + *seg_to_linear(r->cs, 3) = 0x21; + pushw(&ctx.regs, 0x0000); + + /* the value of these registers seem to be assumed by pi_10.com */ + r->esi = 0x100; + r->ecx = 0xff; + r->ebp = 0x0900; + r->edi = 0xfffe; + + for(;;) { + ret = vm86(VM86_ENTER, &ctx); + switch(VM86_TYPE(ret)) { + case VM86_INTx: + { + int int_num, ah; + + int_num = VM86_ARG(ret); + if (int_num != 0x21) + goto unknown_int; + ah = (r->eax >> 8) & 0xff; + switch(ah) { + case 0x00: /* exit */ + exit(0); + case 0x02: /* write char */ + { + uint8_t c = r->edx; + write(1, &c, 1); + } + break; + case 0x09: /* write string */ + { + uint8_t c; + for(;;) { + c = *seg_to_linear(r->ds, r->edx); + if (c == '$') + break; + write(1, &c, 1); + } + r->eax = (r->eax & ~0xff) | '$'; + } + break; + default: + unknown_int: + fprintf(stderr, "unsupported int 0x%02x\n", int_num); + dump_regs(&ctx.regs); + // exit(1); + } + } + break; + case VM86_SIGNAL: + /* a signal came, we just ignore that */ + break; + case VM86_STI: + break; + default: + fprintf(stderr, "unhandled vm86 return code (0x%x)\n", ret); + dump_regs(&ctx.regs); + exit(1); + } + } +} diff --git a/src/recompiler/tests/sha1.c b/src/recompiler/tests/sha1.c new file mode 100644 index 00000000..93b7c8e8 --- /dev/null +++ b/src/recompiler/tests/sha1.c @@ -0,0 +1,240 @@ + +/* from valgrind tests */ + +/* 
================ sha1.c ================ */ +/* +SHA-1 in C +By Steve Reid <steve@edmweb.com> +100% Public Domain + +Test Vectors (from FIPS PUB 180-1) +"abc" + A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D +"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" + 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 +A million repetitions of "a" + 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F +*/ + +/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ +/* #define SHA1HANDSOFF * Copies data before messing with it. */ + +#define SHA1HANDSOFF + +#include <stdio.h> +#include <string.h> +#include <stdint.h> + +/* ================ sha1.h ================ */ +/* +SHA-1 in C +By Steve Reid <steve@edmweb.com> +100% Public Domain +*/ + +typedef struct { + uint32_t state[5]; + uint32_t count[2]; + unsigned char buffer[64]; +} SHA1_CTX; + +void SHA1Transform(uint32_t state[5], const unsigned char buffer[64]); +void SHA1Init(SHA1_CTX* context); +void SHA1Update(SHA1_CTX* context, const unsigned char* data, uint32_t len); +void SHA1Final(unsigned char digest[20], SHA1_CTX* context); +/* ================ end of sha1.h ================ */ +#include <endian.h> + +#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) + +/* blk0() and blk() perform the initial expand. */ +/* I got the idea of expanding during the round function from SSLeay */ +#if BYTE_ORDER == LITTLE_ENDIAN +#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ + |(rol(block->l[i],8)&0x00FF00FF)) +#elif BYTE_ORDER == BIG_ENDIAN +#define blk0(i) block->l[i] +#else +#error "Endianness not defined!" +#endif +#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ + ^block->l[(i+2)&15]^block->l[i&15],1)) + +/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ +#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); +#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); +#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); +#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); +#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); + + +/* Hash a single 512-bit block. This is the core of the algorithm. */ + +void SHA1Transform(uint32_t state[5], const unsigned char buffer[64]) +{ +uint32_t a, b, c, d, e; +typedef union { + unsigned char c[64]; + uint32_t l[16]; +} CHAR64LONG16; +#ifdef SHA1HANDSOFF +CHAR64LONG16 block[1]; /* use array to appear as a pointer */ + memcpy(block, buffer, 64); +#else + /* The following had better never be used because it causes the + * pointer-to-const buffer to be cast into a pointer to non-const. + * And the result is written through. I threw a "const" in, hoping + * this will cause a diagnostic. + */ +CHAR64LONG16* block = (const CHAR64LONG16*)buffer; +#endif + /* Copy context->state[] to working vars */ + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + /* 4 rounds of 20 operations each. Loop unrolled. 
*/ + R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); + R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); + R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); + R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); + R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); + R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); + R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); + R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); + R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); + R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); + R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); + R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); + R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); + R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); + R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); + R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); + R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); + R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); + R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); + R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); + /* Add the working vars back into context.state[] */ + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + /* Wipe variables */ + a = b = c = d = e = 0; +#ifdef SHA1HANDSOFF + memset(block, '\0', sizeof(block)); +#endif +} + + +/* SHA1Init - Initialize new context */ + +void SHA1Init(SHA1_CTX* context) +{ + /* SHA1 initialization constants */ + context->state[0] = 0x67452301; + context->state[1] = 0xEFCDAB89; + context->state[2] = 0x98BADCFE; + context->state[3] = 0x10325476; + context->state[4] = 0xC3D2E1F0; + context->count[0] = context->count[1] = 0; +} + + +/* Run your data through this. */ + +void SHA1Update(SHA1_CTX* context, const unsigned char* data, uint32_t len) +{ +uint32_t i; +uint32_t j; + + j = context->count[0]; + if ((context->count[0] += len << 3) < j) + context->count[1]++; + context->count[1] += (len>>29); + j = (j >> 3) & 63; + if ((j + len) > 63) { + memcpy(&context->buffer[j], data, (i = 64-j)); + SHA1Transform(context->state, context->buffer); + for ( ; i + 63 < len; i += 64) { + SHA1Transform(context->state, &data[i]); + } + j = 0; + } + else i = 0; + memcpy(&context->buffer[j], &data[i], len - i); +} + + +/* Add padding and return the message digest. */ + +void SHA1Final(unsigned char digest[20], SHA1_CTX* context) +{ +unsigned i; +unsigned char finalcount[8]; +unsigned char c; + +#if 0 /* untested "improvement" by DHR */ + /* Convert context->count to a sequence of bytes + * in finalcount. Second element first, but + * big-endian order within element. + * But we do it all backwards. + */ + unsigned char *fcp = &finalcount[8]; + + for (i = 0; i < 2; i++) + { + uint32_t t = context->count[i]; + int j; + + for (j = 0; j < 4; t >>= 8, j++) + *--fcp = (unsigned char) t; + } +#else + for (i = 0; i < 8; i++) { + finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 
0 : 1)] + >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ + } +#endif + c = 0200; + SHA1Update(context, &c, 1); + while ((context->count[0] & 504) != 448) { + c = 0000; + SHA1Update(context, &c, 1); + } + SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ + for (i = 0; i < 20; i++) { + digest[i] = (unsigned char) + ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); + } + /* Wipe variables */ + memset(context, '\0', sizeof(*context)); + memset(&finalcount, '\0', sizeof(finalcount)); +} +/* ================ end of sha1.c ================ */ + +#define BUFSIZE 4096 + +int +main(int argc, char **argv) +{ + SHA1_CTX ctx; + unsigned char hash[20], buf[BUFSIZE]; + int i; + + for(i=0;i<BUFSIZE;i++) + buf[i] = i; + + SHA1Init(&ctx); + for(i=0;i<1000;i++) + SHA1Update(&ctx, buf, BUFSIZE); + SHA1Final(hash, &ctx); + + printf("SHA1="); + for(i=0;i<20;i++) + printf("%02x", hash[i]); + printf("\n"); + return 0; +} diff --git a/src/recompiler/tests/test-i386-code16.S b/src/recompiler/tests/test-i386-code16.S new file mode 100644 index 00000000..816c24b9 --- /dev/null +++ b/src/recompiler/tests/test-i386-code16.S @@ -0,0 +1,79 @@ + .code16 + .globl code16_start + .globl code16_end + +CS_SEG = 0xf + +code16_start: + + .globl code16_func1 + + /* basic test */ +code16_func1 = . - code16_start + mov $1, %eax + data32 lret + +/* test push/pop in 16 bit mode */ + .globl code16_func2 +code16_func2 = . - code16_start + xor %eax, %eax + mov $0x12345678, %ebx + movl %esp, %ecx + push %bx + subl %esp, %ecx + pop %ax + data32 lret + +/* test various jmp opcodes */ + .globl code16_func3 +code16_func3 = . - code16_start + jmp 1f + nop +1: + mov $4, %eax + mov $0x12345678, %ebx + xor %bx, %bx + jz 2f + add $2, %ax +2: + + call myfunc + + lcall $CS_SEG, $(myfunc2 - code16_start) + + ljmp $CS_SEG, $(myjmp1 - code16_start) +myjmp1_next: + + cs lcall *myfunc2_addr - code16_start + + cs ljmp *myjmp2_addr - code16_start +myjmp2_next: + + data32 lret + +myfunc2_addr: + .short myfunc2 - code16_start + .short CS_SEG + +myjmp2_addr: + .short myjmp2 - code16_start + .short CS_SEG + +myjmp1: + add $8, %ax + jmp myjmp1_next + +myjmp2: + add $16, %ax + jmp myjmp2_next + +myfunc: + add $1, %ax + ret + +myfunc2: + add $4, %ax + lret + + +code16_end: diff --git a/src/recompiler/tests/test-i386-muldiv.h b/src/recompiler/tests/test-i386-muldiv.h new file mode 100644 index 00000000..015f59e1 --- /dev/null +++ b/src/recompiler/tests/test-i386-muldiv.h @@ -0,0 +1,76 @@ + +void glue(glue(test_, OP), b)(long op0, long op1) +{ + long res, s1, s0, flags; + s0 = op0; + s1 = op1; + res = s0; + flags = 0; + asm ("push %4\n\t" + "popf\n\t" + stringify(OP)"b %b2\n\t" + "pushf\n\t" + "pop %1\n\t" + : "=a" (res), "=g" (flags) + : "q" (s1), "0" (res), "1" (flags)); + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CC=%04lx\n", + stringify(OP) "b", s0, s1, res, flags & CC_MASK); +} + +void glue(glue(test_, OP), w)(long op0h, long op0, long op1) +{ + long res, s1, flags, resh; + s1 = op1; + resh = op0h; + res = op0; + flags = 0; + asm ("push %5\n\t" + "popf\n\t" + stringify(OP) "w %w3\n\t" + "pushf\n\t" + "pop %1\n\t" + : "=a" (res), "=g" (flags), "=d" (resh) + : "q" (s1), "0" (res), "1" (flags), "2" (resh)); + printf("%-10s AH=" FMTLX " AL=" FMTLX " B=" FMTLX " RH=" FMTLX " RL=" FMTLX " CC=%04lx\n", + stringify(OP) "w", op0h, op0, s1, resh, res, flags & CC_MASK); +} + +void glue(glue(test_, OP), l)(long op0h, long op0, long op1) +{ + long res, s1, flags, resh; + s1 = op1; + resh = op0h; + res = op0; + flags = 0; + asm 
("push %5\n\t" + "popf\n\t" + stringify(OP) "l %k3\n\t" + "pushf\n\t" + "pop %1\n\t" + : "=a" (res), "=g" (flags), "=d" (resh) + : "q" (s1), "0" (res), "1" (flags), "2" (resh)); + printf("%-10s AH=" FMTLX " AL=" FMTLX " B=" FMTLX " RH=" FMTLX " RL=" FMTLX " CC=%04lx\n", + stringify(OP) "l", op0h, op0, s1, resh, res, flags & CC_MASK); +} + +#if defined(__x86_64__) +void glue(glue(test_, OP), q)(long op0h, long op0, long op1) +{ + long res, s1, flags, resh; + s1 = op1; + resh = op0h; + res = op0; + flags = 0; + asm ("push %5\n\t" + "popf\n\t" + stringify(OP) "q %3\n\t" + "pushf\n\t" + "pop %1\n\t" + : "=a" (res), "=g" (flags), "=d" (resh) + : "q" (s1), "0" (res), "1" (flags), "2" (resh)); + printf("%-10s AH=" FMTLX " AL=" FMTLX " B=" FMTLX " RH=" FMTLX " RL=" FMTLX " CC=%04lx\n", + stringify(OP) "q", op0h, op0, s1, resh, res, flags & CC_MASK); +} +#endif + +#undef OP diff --git a/src/recompiler/tests/test-i386-shift.h b/src/recompiler/tests/test-i386-shift.h new file mode 100644 index 00000000..3d8f84bf --- /dev/null +++ b/src/recompiler/tests/test-i386-shift.h @@ -0,0 +1,185 @@ + +#define exec_op glue(exec_, OP) +#define exec_opq glue(glue(exec_, OP), q) +#define exec_opl glue(glue(exec_, OP), l) +#define exec_opw glue(glue(exec_, OP), w) +#define exec_opb glue(glue(exec_, OP), b) + +#ifndef OP_SHIFTD + +#ifdef OP_NOBYTE +#define EXECSHIFT(size, rsize, res, s1, s2, flags) \ + asm ("push %4\n\t"\ + "popf\n\t"\ + stringify(OP) size " %" rsize "2, %" rsize "0\n\t" \ + "pushf\n\t"\ + "pop %1\n\t"\ + : "=g" (res), "=g" (flags)\ + : "r" (s1), "0" (res), "1" (flags)); +#else +#define EXECSHIFT(size, rsize, res, s1, s2, flags) \ + asm ("push %4\n\t"\ + "popf\n\t"\ + stringify(OP) size " %%cl, %" rsize "0\n\t" \ + "pushf\n\t"\ + "pop %1\n\t"\ + : "=q" (res), "=g" (flags)\ + : "c" (s1), "0" (res), "1" (flags)); +#endif + +#if defined(__x86_64__) +void exec_opq(long s2, long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECSHIFT("q", "", res, s1, s2, flags); + /* overflow is undefined if count != 1 */ + if (s1 != 1) + flags &= ~CC_O; + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", + stringify(OP) "q", s0, s1, res, iflags, flags & CC_MASK); +} +#endif + +void exec_opl(long s2, long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECSHIFT("l", "k", res, s1, s2, flags); + /* overflow is undefined if count != 1 */ + if (s1 != 1) + flags &= ~CC_O; + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", + stringify(OP) "l", s0, s1, res, iflags, flags & CC_MASK); +} + +void exec_opw(long s2, long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECSHIFT("w", "w", res, s1, s2, flags); + /* overflow is undefined if count != 1 */ + if (s1 != 1) + flags &= ~CC_O; + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", + stringify(OP) "w", s0, s1, res, iflags, flags & CC_MASK); +} + +#else +#define EXECSHIFT(size, rsize, res, s1, s2, flags) \ + asm ("push %4\n\t"\ + "popf\n\t"\ + stringify(OP) size " %%cl, %" rsize "5, %" rsize "0\n\t" \ + "pushf\n\t"\ + "pop %1\n\t"\ + : "=g" (res), "=g" (flags)\ + : "c" (s1), "0" (res), "1" (flags), "r" (s2)); + +#if defined(__x86_64__) +void exec_opq(long s2, long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECSHIFT("q", "", res, s1, s2, flags); + /* overflow is undefined if count != 1 */ + if (s1 != 1) + flags &= ~CC_O; + printf("%-10s A=" FMTLX " B=" FMTLX " C=" FMTLX " R=" 
FMTLX " CCIN=%04lx CC=%04lx\n", + stringify(OP) "q", s0, s2, s1, res, iflags, flags & CC_MASK); +} +#endif + +void exec_opl(long s2, long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECSHIFT("l", "k", res, s1, s2, flags); + /* overflow is undefined if count != 1 */ + if (s1 != 1) + flags &= ~CC_O; + printf("%-10s A=" FMTLX " B=" FMTLX " C=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", + stringify(OP) "l", s0, s2, s1, res, iflags, flags & CC_MASK); +} + +void exec_opw(long s2, long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECSHIFT("w", "w", res, s1, s2, flags); + /* overflow is undefined if count != 1 */ + if (s1 != 1) + flags &= ~CC_O; + printf("%-10s A=" FMTLX " B=" FMTLX " C=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", + stringify(OP) "w", s0, s2, s1, res, iflags, flags & CC_MASK); +} + +#endif + +#ifndef OP_NOBYTE +void exec_opb(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECSHIFT("b", "b", res, s1, 0, flags); + /* overflow is undefined if count != 1 */ + if (s1 != 1) + flags &= ~CC_O; + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", + stringify(OP) "b", s0, s1, res, iflags, flags & CC_MASK); +} +#endif + +void exec_op(long s2, long s0, long s1) +{ + s2 = i2l(s2); + s0 = i2l(s0); +#if defined(__x86_64__) + exec_opq(s2, s0, s1, 0); +#endif + exec_opl(s2, s0, s1, 0); +#ifdef OP_SHIFTD + exec_opw(s2, s0, s1, 0); +#else + exec_opw(s2, s0, s1, 0); +#endif +#ifndef OP_NOBYTE + exec_opb(s0, s1, 0); +#endif +#ifdef OP_CC +#if defined(__x86_64__) + exec_opq(s2, s0, s1, CC_C); +#endif + exec_opl(s2, s0, s1, CC_C); + exec_opw(s2, s0, s1, CC_C); + exec_opb(s0, s1, CC_C); +#endif +} + +void glue(test_, OP)(void) +{ + int i, n; +#if defined(__x86_64__) + n = 64; +#else + n = 32; +#endif + for(i = 0; i < n; i++) + exec_op(0x21ad3d34, 0x12345678, i); + for(i = 0; i < n; i++) + exec_op(0x813f3421, 0x82345679, i); +} + +void *glue(_test_, OP) __init_call = glue(test_, OP); + +#undef OP +#undef OP_CC +#undef OP_SHIFTD +#undef OP_NOBYTE +#undef EXECSHIFT diff --git a/src/recompiler/tests/test-i386-ssse3.c b/src/recompiler/tests/test-i386-ssse3.c new file mode 100644 index 00000000..0a42bd03 --- /dev/null +++ b/src/recompiler/tests/test-i386-ssse3.c @@ -0,0 +1,57 @@ +/* See if various MMX/SSE SSSE3 instructions give expected results */ +#include <stdio.h> +#include <string.h> +#include <stdint.h> + +int main(int argc, char *argv[]) { + char hello[16]; + const char ehlo[8] = "EHLO "; + uint64_t mask = 0x8080800302020001; + + uint64_t a = 0x0000000000090007; + uint64_t b = 0x0000000000000000; + uint32_t c; + uint16_t d; + + const char e[16] = "LLOaaaaaaaaaaaaa"; + const char f[16] = "aaaaaaaaaaaaaaHE"; + + /* pshufb mm1/xmm1, mm2/xmm2 */ + asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1"); + asm volatile ("movq %0, %%mm1" : : "m" (mask)); + asm volatile ("pshufb %mm1, %mm0"); + asm volatile ("movq %%mm0, %0" : "=m" (hello)); + printf("%s\n", hello); + + /* pshufb mm1/xmm1, m64/m128 */ + asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0"); + asm volatile ("pshufb %0, %%mm0" : : "m" (mask)); + asm volatile ("movq %%mm0, %0" : "=m" (hello)); + printf("%s\n", hello); + + /* psubsw mm1/xmm1, m64/m128 */ + asm volatile ("movq %0, %%mm0" : : "r" (a) : "mm0"); + asm volatile ("phsubsw %0, %%mm0" : : "m" (b)); + asm volatile ("movq %%mm0, %0" : "=m" (a)); + printf("%i - %i = %i\n", 9, 7, -(int16_t) a); + + /* palignr mm1/xmm1, m64/m128, imm8 */ + asm 
volatile ("movdqa (%0), %%xmm0" : : "r" (e) : "xmm0"); + asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (f)); + asm volatile ("movdqa %%xmm0, (%0)" : : "r" (hello)); + printf("%5.5s\n", hello); + +#if 1 /* SSE4 */ + /* popcnt r64, r/m64 */ + asm volatile ("movq $0x8421000010009c63, %%rax" : : : "rax"); + asm volatile ("popcnt %%ax, %%dx" : : : "dx"); + asm volatile ("popcnt %%eax, %%ecx" : : : "ecx"); + asm volatile ("popcnt %rax, %rax"); + asm volatile ("movq %%rax, %0" : "=m" (a)); + asm volatile ("movl %%ecx, %0" : "=m" (c)); + asm volatile ("movw %%dx, %0" : "=m" (d)); + printf("%i = %i\n%i = %i = %i\n", 13, (int) a, 9, c, d + 1); +#endif + + return 0; +} diff --git a/src/recompiler/tests/test-i386-vm86.S b/src/recompiler/tests/test-i386-vm86.S new file mode 100644 index 00000000..3bb96c99 --- /dev/null +++ b/src/recompiler/tests/test-i386-vm86.S @@ -0,0 +1,103 @@ + .code16 + .globl vm86_code_start + .globl vm86_code_end + +#define GET_OFFSET(x) ((x) - vm86_code_start + 0x100) + +vm86_code_start: + movw $GET_OFFSET(hello_world), %dx + movb $0x09, %ah + int $0x21 + + /* prepare int 0x90 vector */ + xorw %ax, %ax + movw %ax, %es + es movw $GET_OFFSET(int90_test), 0x90 * 4 + es movw %cs, 0x90 * 4 + 2 + + /* launch int 0x90 */ + + int $0x90 + + /* test IF support */ + movw $GET_OFFSET(IF_msg), %dx + movb $0x09, %ah + int $0x21 + + pushf + popw %dx + movb $0xff, %ah + int $0x21 + + cli + pushf + popw %dx + movb $0xff, %ah + int $0x21 + + sti + pushfl + popl %edx + movb $0xff, %ah + int $0x21 + +#if 0 + movw $GET_OFFSET(IF_msg1), %dx + movb $0x09, %ah + int $0x21 + + pushf + movw %sp, %bx + andw $~0x200, (%bx) + popf +#else + cli +#endif + + pushf + popw %dx + movb $0xff, %ah + int $0x21 + + pushfl + movw %sp, %bx + orw $0x200, (%bx) + popfl + + pushfl + popl %edx + movb $0xff, %ah + int $0x21 + + movb $0x00, %ah + int $0x21 + +int90_test: + pushf + pop %dx + movb $0xff, %ah + int $0x21 + + movw %sp, %bx + movw 4(%bx), %dx + movb $0xff, %ah + int $0x21 + + movw $GET_OFFSET(int90_msg), %dx + movb $0x09, %ah + int $0x21 + iret + +int90_msg: + .string "INT90 started\n$" + +hello_world: + .string "Hello VM86 world\n$" + +IF_msg: + .string "VM86 IF test\n$" + +IF_msg1: + .string "If you see a diff here, your Linux kernel is buggy, please update to 2.4.20 kernel\n$" + +vm86_code_end: diff --git a/src/recompiler/tests/test-i386.c b/src/recompiler/tests/test-i386.c new file mode 100644 index 00000000..c8f21a95 --- /dev/null +++ b/src/recompiler/tests/test-i386.c @@ -0,0 +1,2769 @@ +/* + * x86 CPU test + * + * Copyright (c) 2003 Fabrice Bellard + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +/* + * Oracle GPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the General Public License version 2 (GPLv2) at this time for any software where + * a choice of GPL license versions is made available with the language indicating + * that GPLv2 or any later version may be used, or where a choice of which version + * of the GPL is applied is otherwise unspecified. + */ + +#define _GNU_SOURCE +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <math.h> +#include <signal.h> +#include <setjmp.h> +#include <errno.h> +#include <sys/ucontext.h> +#include <sys/mman.h> + +#if !defined(__x86_64__) +//#define TEST_VM86 +#define TEST_SEGS +#endif +//#define LINUX_VM86_IOPL_FIX +//#define TEST_P4_FLAGS +#ifdef __SSE__ +#define TEST_SSE +#define TEST_CMOV 1 +#define TEST_FCOMI 1 +#else +#undef TEST_SSE +#define TEST_CMOV 1 +#define TEST_FCOMI 1 +#endif + +#if defined(__x86_64__) +#define FMT64X "%016lx" +#define FMTLX "%016lx" +#define X86_64_ONLY(x) x +#else +#define FMT64X "%016" PRIx64 +#define FMTLX "%08lx" +#define X86_64_ONLY(x) +#endif + +#ifdef TEST_VM86 +#include <asm/vm86.h> +#endif + +#define xglue(x, y) x ## y +#define glue(x, y) xglue(x, y) +#define stringify(s) tostring(s) +#define tostring(s) #s + +#define CC_C 0x0001 +#define CC_P 0x0004 +#define CC_A 0x0010 +#define CC_Z 0x0040 +#define CC_S 0x0080 +#define CC_O 0x0800 + +#define __init_call __attribute__ ((unused,__section__ ("initcall"))) + +#define CC_MASK (CC_C | CC_P | CC_Z | CC_S | CC_O | CC_A) + +#if defined(__x86_64__) +static inline long i2l(long v) +{ + return v | ((v ^ 0xabcd) << 32); +} +#else +static inline long i2l(long v) +{ + return v; +} +#endif + +#define OP add +#include "test-i386.h" + +#define OP sub +#include "test-i386.h" + +#define OP xor +#include "test-i386.h" + +#define OP and +#include "test-i386.h" + +#define OP or +#include "test-i386.h" + +#define OP cmp +#include "test-i386.h" + +#define OP adc +#define OP_CC +#include "test-i386.h" + +#define OP sbb +#define OP_CC +#include "test-i386.h" + +#define OP inc +#define OP_CC +#define OP1 +#include "test-i386.h" + +#define OP dec +#define OP_CC +#define OP1 +#include "test-i386.h" + +#define OP neg +#define OP_CC +#define OP1 +#include "test-i386.h" + +#define OP not +#define OP_CC +#define OP1 +#include "test-i386.h" + +#undef CC_MASK +#define CC_MASK (CC_C | CC_P | CC_Z | CC_S | CC_O) + +#define OP shl +#include "test-i386-shift.h" + +#define OP shr +#include "test-i386-shift.h" + +#define OP sar +#include "test-i386-shift.h" + +#define OP rol +#include "test-i386-shift.h" + +#define OP ror +#include "test-i386-shift.h" + +#define OP rcr +#define OP_CC +#include "test-i386-shift.h" + +#define OP rcl +#define OP_CC +#include "test-i386-shift.h" + +#define OP shld +#define OP_SHIFTD +#define OP_NOBYTE +#include "test-i386-shift.h" + +#define OP shrd +#define OP_SHIFTD +#define OP_NOBYTE +#include "test-i386-shift.h" + +/* XXX: should be more precise ? 
*/ +#undef CC_MASK +#define CC_MASK (CC_C) + +#define OP bt +#define OP_NOBYTE +#include "test-i386-shift.h" + +#define OP bts +#define OP_NOBYTE +#include "test-i386-shift.h" + +#define OP btr +#define OP_NOBYTE +#include "test-i386-shift.h" + +#define OP btc +#define OP_NOBYTE +#include "test-i386-shift.h" + +/* lea test (modrm support) */ +#define TEST_LEAQ(STR)\ +{\ + asm("lea " STR ", %0"\ + : "=r" (res)\ + : "a" (eax), "b" (ebx), "c" (ecx), "d" (edx), "S" (esi), "D" (edi));\ + printf("lea %s = " FMTLX "\n", STR, res);\ +} + +#define TEST_LEA(STR)\ +{\ + asm("lea " STR ", %0"\ + : "=r" (res)\ + : "a" (eax), "b" (ebx), "c" (ecx), "d" (edx), "S" (esi), "D" (edi));\ + printf("lea %s = " FMTLX "\n", STR, res);\ +} + +#define TEST_LEA16(STR)\ +{\ + asm(".code16 ; .byte 0x67 ; leal " STR ", %0 ; .code32"\ + : "=wq" (res)\ + : "a" (eax), "b" (ebx), "c" (ecx), "d" (edx), "S" (esi), "D" (edi));\ + printf("lea %s = %08lx\n", STR, res);\ +} + + +void test_lea(void) +{ + long eax, ebx, ecx, edx, esi, edi, res; + eax = i2l(0x0001); + ebx = i2l(0x0002); + ecx = i2l(0x0004); + edx = i2l(0x0008); + esi = i2l(0x0010); + edi = i2l(0x0020); + + TEST_LEA("0x4000"); + + TEST_LEA("(%%eax)"); + TEST_LEA("(%%ebx)"); + TEST_LEA("(%%ecx)"); + TEST_LEA("(%%edx)"); + TEST_LEA("(%%esi)"); + TEST_LEA("(%%edi)"); + + TEST_LEA("0x40(%%eax)"); + TEST_LEA("0x40(%%ebx)"); + TEST_LEA("0x40(%%ecx)"); + TEST_LEA("0x40(%%edx)"); + TEST_LEA("0x40(%%esi)"); + TEST_LEA("0x40(%%edi)"); + + TEST_LEA("0x4000(%%eax)"); + TEST_LEA("0x4000(%%ebx)"); + TEST_LEA("0x4000(%%ecx)"); + TEST_LEA("0x4000(%%edx)"); + TEST_LEA("0x4000(%%esi)"); + TEST_LEA("0x4000(%%edi)"); + + TEST_LEA("(%%eax, %%ecx)"); + TEST_LEA("(%%ebx, %%edx)"); + TEST_LEA("(%%ecx, %%ecx)"); + TEST_LEA("(%%edx, %%ecx)"); + TEST_LEA("(%%esi, %%ecx)"); + TEST_LEA("(%%edi, %%ecx)"); + + TEST_LEA("0x40(%%eax, %%ecx)"); + TEST_LEA("0x4000(%%ebx, %%edx)"); + + TEST_LEA("(%%ecx, %%ecx, 2)"); + TEST_LEA("(%%edx, %%ecx, 4)"); + TEST_LEA("(%%esi, %%ecx, 8)"); + + TEST_LEA("(,%%eax, 2)"); + TEST_LEA("(,%%ebx, 4)"); + TEST_LEA("(,%%ecx, 8)"); + + TEST_LEA("0x40(,%%eax, 2)"); + TEST_LEA("0x40(,%%ebx, 4)"); + TEST_LEA("0x40(,%%ecx, 8)"); + + + TEST_LEA("-10(%%ecx, %%ecx, 2)"); + TEST_LEA("-10(%%edx, %%ecx, 4)"); + TEST_LEA("-10(%%esi, %%ecx, 8)"); + + TEST_LEA("0x4000(%%ecx, %%ecx, 2)"); + TEST_LEA("0x4000(%%edx, %%ecx, 4)"); + TEST_LEA("0x4000(%%esi, %%ecx, 8)"); + +#if defined(__x86_64__) + TEST_LEAQ("0x4000"); + TEST_LEAQ("0x4000(%%rip)"); + + TEST_LEAQ("(%%rax)"); + TEST_LEAQ("(%%rbx)"); + TEST_LEAQ("(%%rcx)"); + TEST_LEAQ("(%%rdx)"); + TEST_LEAQ("(%%rsi)"); + TEST_LEAQ("(%%rdi)"); + + TEST_LEAQ("0x40(%%rax)"); + TEST_LEAQ("0x40(%%rbx)"); + TEST_LEAQ("0x40(%%rcx)"); + TEST_LEAQ("0x40(%%rdx)"); + TEST_LEAQ("0x40(%%rsi)"); + TEST_LEAQ("0x40(%%rdi)"); + + TEST_LEAQ("0x4000(%%rax)"); + TEST_LEAQ("0x4000(%%rbx)"); + TEST_LEAQ("0x4000(%%rcx)"); + TEST_LEAQ("0x4000(%%rdx)"); + TEST_LEAQ("0x4000(%%rsi)"); + TEST_LEAQ("0x4000(%%rdi)"); + + TEST_LEAQ("(%%rax, %%rcx)"); + TEST_LEAQ("(%%rbx, %%rdx)"); + TEST_LEAQ("(%%rcx, %%rcx)"); + TEST_LEAQ("(%%rdx, %%rcx)"); + TEST_LEAQ("(%%rsi, %%rcx)"); + TEST_LEAQ("(%%rdi, %%rcx)"); + + TEST_LEAQ("0x40(%%rax, %%rcx)"); + TEST_LEAQ("0x4000(%%rbx, %%rdx)"); + + TEST_LEAQ("(%%rcx, %%rcx, 2)"); + TEST_LEAQ("(%%rdx, %%rcx, 4)"); + TEST_LEAQ("(%%rsi, %%rcx, 8)"); + + TEST_LEAQ("(,%%rax, 2)"); + TEST_LEAQ("(,%%rbx, 4)"); + TEST_LEAQ("(,%%rcx, 8)"); + + TEST_LEAQ("0x40(,%%rax, 2)"); + TEST_LEAQ("0x40(,%%rbx, 4)"); + TEST_LEAQ("0x40(,%%rcx, 8)"); + + + 
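+    /* scaled-index addressing combined with a negative displacement */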
TEST_LEAQ("-10(%%rcx, %%rcx, 2)"); + TEST_LEAQ("-10(%%rdx, %%rcx, 4)"); + TEST_LEAQ("-10(%%rsi, %%rcx, 8)"); + + TEST_LEAQ("0x4000(%%rcx, %%rcx, 2)"); + TEST_LEAQ("0x4000(%%rdx, %%rcx, 4)"); + TEST_LEAQ("0x4000(%%rsi, %%rcx, 8)"); +#else + /* limited 16 bit addressing test */ + TEST_LEA16("0x4000"); + TEST_LEA16("(%%bx)"); + TEST_LEA16("(%%si)"); + TEST_LEA16("(%%di)"); + TEST_LEA16("0x40(%%bx)"); + TEST_LEA16("0x40(%%si)"); + TEST_LEA16("0x40(%%di)"); + TEST_LEA16("0x4000(%%bx)"); + TEST_LEA16("0x4000(%%si)"); + TEST_LEA16("(%%bx,%%si)"); + TEST_LEA16("(%%bx,%%di)"); + TEST_LEA16("0x40(%%bx,%%si)"); + TEST_LEA16("0x40(%%bx,%%di)"); + TEST_LEA16("0x4000(%%bx,%%si)"); + TEST_LEA16("0x4000(%%bx,%%di)"); +#endif +} + +#define TEST_JCC(JCC, v1, v2)\ +{\ + int res;\ + asm("movl $1, %0\n\t"\ + "cmpl %2, %1\n\t"\ + "j" JCC " 1f\n\t"\ + "movl $0, %0\n\t"\ + "1:\n\t"\ + : "=r" (res)\ + : "r" (v1), "r" (v2));\ + printf("%-10s %d\n", "j" JCC, res);\ +\ + asm("movl $0, %0\n\t"\ + "cmpl %2, %1\n\t"\ + "set" JCC " %b0\n\t"\ + : "=r" (res)\ + : "r" (v1), "r" (v2));\ + printf("%-10s %d\n", "set" JCC, res);\ + if (TEST_CMOV) {\ + long val = i2l(1);\ + long res = i2l(0x12345678);\ +X86_64_ONLY(\ + asm("cmpl %2, %1\n\t"\ + "cmov" JCC "q %3, %0\n\t"\ + : "=r" (res)\ + : "r" (v1), "r" (v2), "m" (val), "0" (res));\ + printf("%-10s R=" FMTLX "\n", "cmov" JCC "q", res);)\ + asm("cmpl %2, %1\n\t"\ + "cmov" JCC "l %k3, %k0\n\t"\ + : "=r" (res)\ + : "r" (v1), "r" (v2), "m" (val), "0" (res));\ + printf("%-10s R=" FMTLX "\n", "cmov" JCC "l", res);\ + asm("cmpl %2, %1\n\t"\ + "cmov" JCC "w %w3, %w0\n\t"\ + : "=r" (res)\ + : "r" (v1), "r" (v2), "r" (1), "0" (res));\ + printf("%-10s R=" FMTLX "\n", "cmov" JCC "w", res);\ + } \ +} + +/* various jump tests */ +void test_jcc(void) +{ + TEST_JCC("ne", 1, 1); + TEST_JCC("ne", 1, 0); + + TEST_JCC("e", 1, 1); + TEST_JCC("e", 1, 0); + + TEST_JCC("l", 1, 1); + TEST_JCC("l", 1, 0); + TEST_JCC("l", 1, -1); + + TEST_JCC("le", 1, 1); + TEST_JCC("le", 1, 0); + TEST_JCC("le", 1, -1); + + TEST_JCC("ge", 1, 1); + TEST_JCC("ge", 1, 0); + TEST_JCC("ge", -1, 1); + + TEST_JCC("g", 1, 1); + TEST_JCC("g", 1, 0); + TEST_JCC("g", 1, -1); + + TEST_JCC("b", 1, 1); + TEST_JCC("b", 1, 0); + TEST_JCC("b", 1, -1); + + TEST_JCC("be", 1, 1); + TEST_JCC("be", 1, 0); + TEST_JCC("be", 1, -1); + + TEST_JCC("ae", 1, 1); + TEST_JCC("ae", 1, 0); + TEST_JCC("ae", 1, -1); + + TEST_JCC("a", 1, 1); + TEST_JCC("a", 1, 0); + TEST_JCC("a", 1, -1); + + + TEST_JCC("p", 1, 1); + TEST_JCC("p", 1, 0); + + TEST_JCC("np", 1, 1); + TEST_JCC("np", 1, 0); + + TEST_JCC("o", 0x7fffffff, 0); + TEST_JCC("o", 0x7fffffff, -1); + + TEST_JCC("no", 0x7fffffff, 0); + TEST_JCC("no", 0x7fffffff, -1); + + TEST_JCC("s", 0, 1); + TEST_JCC("s", 0, -1); + TEST_JCC("s", 0, 0); + + TEST_JCC("ns", 0, 1); + TEST_JCC("ns", 0, -1); + TEST_JCC("ns", 0, 0); +} + +#define TEST_LOOP(insn) \ +{\ + for(i = 0; i < sizeof(ecx_vals) / sizeof(long); i++) {\ + ecx = ecx_vals[i];\ + for(zf = 0; zf < 2; zf++) {\ + asm("test %2, %2\n\t"\ + "movl $1, %0\n\t"\ + insn " 1f\n\t" \ + "movl $0, %0\n\t"\ + "1:\n\t"\ + : "=a" (res)\ + : "c" (ecx), "b" (!zf)); \ + printf("%-10s ECX=" FMTLX " ZF=%ld r=%d\n", insn, ecx, zf, res); \ + }\ + }\ +} + +void test_loop(void) +{ + long ecx, zf; + const long ecx_vals[] = { + 0, + 1, + 0x10000, + 0x10001, +#if defined(__x86_64__) + 0x100000000L, + 0x100000001L, +#endif + }; + int i, res; + +#if !defined(__x86_64__) + TEST_LOOP("jcxz"); + TEST_LOOP("loopw"); + TEST_LOOP("loopzw"); + TEST_LOOP("loopnzw"); +#endif + + 
TEST_LOOP("jecxz"); + TEST_LOOP("loopl"); + TEST_LOOP("loopzl"); + TEST_LOOP("loopnzl"); +} + +#undef CC_MASK +#ifdef TEST_P4_FLAGS +#define CC_MASK (CC_C | CC_P | CC_Z | CC_S | CC_O | CC_A) +#else +#define CC_MASK (CC_O | CC_C) +#endif + +#define OP mul +#include "test-i386-muldiv.h" + +#define OP imul +#include "test-i386-muldiv.h" + +void test_imulw2(long op0, long op1) +{ + long res, s1, s0, flags; + s0 = op0; + s1 = op1; + res = s0; + flags = 0; + asm volatile ("push %4\n\t" + "popf\n\t" + "imulw %w2, %w0\n\t" + "pushf\n\t" + "pop %1\n\t" + : "=q" (res), "=g" (flags) + : "q" (s1), "0" (res), "1" (flags)); + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CC=%04lx\n", + "imulw", s0, s1, res, flags & CC_MASK); +} + +void test_imull2(long op0, long op1) +{ + long res, s1, s0, flags; + s0 = op0; + s1 = op1; + res = s0; + flags = 0; + asm volatile ("push %4\n\t" + "popf\n\t" + "imull %k2, %k0\n\t" + "pushf\n\t" + "pop %1\n\t" + : "=q" (res), "=g" (flags) + : "q" (s1), "0" (res), "1" (flags)); + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CC=%04lx\n", + "imull", s0, s1, res, flags & CC_MASK); +} + +#if defined(__x86_64__) +void test_imulq2(long op0, long op1) +{ + long res, s1, s0, flags; + s0 = op0; + s1 = op1; + res = s0; + flags = 0; + asm volatile ("push %4\n\t" + "popf\n\t" + "imulq %2, %0\n\t" + "pushf\n\t" + "pop %1\n\t" + : "=q" (res), "=g" (flags) + : "q" (s1), "0" (res), "1" (flags)); + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CC=%04lx\n", + "imulq", s0, s1, res, flags & CC_MASK); +} +#endif + +#define TEST_IMUL_IM(size, rsize, op0, op1)\ +{\ + long res, flags, s1;\ + flags = 0;\ + res = 0;\ + s1 = op1;\ + asm volatile ("push %3\n\t"\ + "popf\n\t"\ + "imul" size " $" #op0 ", %" rsize "2, %" rsize "0\n\t" \ + "pushf\n\t"\ + "pop %1\n\t"\ + : "=r" (res), "=g" (flags)\ + : "r" (s1), "1" (flags), "0" (res));\ + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CC=%04lx\n",\ + "imul" size " im", (long)op0, (long)op1, res, flags & CC_MASK);\ +} + + +#undef CC_MASK +#define CC_MASK (0) + +#define OP div +#include "test-i386-muldiv.h" + +#define OP idiv +#include "test-i386-muldiv.h" + +void test_mul(void) +{ + test_imulb(0x1234561d, 4); + test_imulb(3, -4); + test_imulb(0x80, 0x80); + test_imulb(0x10, 0x10); + + test_imulw(0, 0x1234001d, 45); + test_imulw(0, 23, -45); + test_imulw(0, 0x8000, 0x8000); + test_imulw(0, 0x100, 0x100); + + test_imull(0, 0x1234001d, 45); + test_imull(0, 23, -45); + test_imull(0, 0x80000000, 0x80000000); + test_imull(0, 0x10000, 0x10000); + + test_mulb(0x1234561d, 4); + test_mulb(3, -4); + test_mulb(0x80, 0x80); + test_mulb(0x10, 0x10); + + test_mulw(0, 0x1234001d, 45); + test_mulw(0, 23, -45); + test_mulw(0, 0x8000, 0x8000); + test_mulw(0, 0x100, 0x100); + + test_mull(0, 0x1234001d, 45); + test_mull(0, 23, -45); + test_mull(0, 0x80000000, 0x80000000); + test_mull(0, 0x10000, 0x10000); + + test_imulw2(0x1234001d, 45); + test_imulw2(23, -45); + test_imulw2(0x8000, 0x8000); + test_imulw2(0x100, 0x100); + + test_imull2(0x1234001d, 45); + test_imull2(23, -45); + test_imull2(0x80000000, 0x80000000); + test_imull2(0x10000, 0x10000); + + TEST_IMUL_IM("w", "w", 45, 0x1234); + TEST_IMUL_IM("w", "w", -45, 23); + TEST_IMUL_IM("w", "w", 0x8000, 0x80000000); + TEST_IMUL_IM("w", "w", 0x7fff, 0x1000); + + TEST_IMUL_IM("l", "k", 45, 0x1234); + TEST_IMUL_IM("l", "k", -45, 23); + TEST_IMUL_IM("l", "k", 0x8000, 0x80000000); + TEST_IMUL_IM("l", "k", 0x7fff, 0x1000); + + test_idivb(0x12341678, 0x127e); + test_idivb(0x43210123, -5); + test_idivb(0x12340004, -1); + + 
test_idivw(0, 0x12345678, 12347); + test_idivw(0, -23223, -45); + test_idivw(0, 0x12348000, -1); + test_idivw(0x12343, 0x12345678, 0x81238567); + + test_idivl(0, 0x12345678, 12347); + test_idivl(0, -233223, -45); + test_idivl(0, 0x80000000, -1); + test_idivl(0x12343, 0x12345678, 0x81234567); + + test_divb(0x12341678, 0x127e); + test_divb(0x43210123, -5); + test_divb(0x12340004, -1); + + test_divw(0, 0x12345678, 12347); + test_divw(0, -23223, -45); + test_divw(0, 0x12348000, -1); + test_divw(0x12343, 0x12345678, 0x81238567); + + test_divl(0, 0x12345678, 12347); + test_divl(0, -233223, -45); + test_divl(0, 0x80000000, -1); + test_divl(0x12343, 0x12345678, 0x81234567); + +#if defined(__x86_64__) + test_imulq(0, 0x1234001d1234001d, 45); + test_imulq(0, 23, -45); + test_imulq(0, 0x8000000000000000, 0x8000000000000000); + test_imulq(0, 0x100000000, 0x100000000); + + test_mulq(0, 0x1234001d1234001d, 45); + test_mulq(0, 23, -45); + test_mulq(0, 0x8000000000000000, 0x8000000000000000); + test_mulq(0, 0x100000000, 0x100000000); + + test_imulq2(0x1234001d1234001d, 45); + test_imulq2(23, -45); + test_imulq2(0x8000000000000000, 0x8000000000000000); + test_imulq2(0x100000000, 0x100000000); + + TEST_IMUL_IM("q", "", 45, 0x12341234); + TEST_IMUL_IM("q", "", -45, 23); + TEST_IMUL_IM("q", "", 0x8000, 0x8000000000000000); + TEST_IMUL_IM("q", "", 0x7fff, 0x10000000); + + test_idivq(0, 0x12345678abcdef, 12347); + test_idivq(0, -233223, -45); + test_idivq(0, 0x8000000000000000, -1); + test_idivq(0x12343, 0x12345678, 0x81234567); + + test_divq(0, 0x12345678abcdef, 12347); + test_divq(0, -233223, -45); + test_divq(0, 0x8000000000000000, -1); + test_divq(0x12343, 0x12345678, 0x81234567); +#endif +} + +#define TEST_BSX(op, size, op0)\ +{\ + long res, val, resz;\ + val = op0;\ + asm("xor %1, %1\n"\ + "mov $0x12345678, %0\n"\ + #op " %" size "2, %" size "0 ; setz %b1" \ + : "=&r" (res), "=&q" (resz)\ + : "r" (val));\ + printf("%-10s A=" FMTLX " R=" FMTLX " %ld\n", #op, val, res, resz);\ +} + +void test_bsx(void) +{ + TEST_BSX(bsrw, "w", 0); + TEST_BSX(bsrw, "w", 0x12340128); + TEST_BSX(bsfw, "w", 0); + TEST_BSX(bsfw, "w", 0x12340128); + TEST_BSX(bsrl, "k", 0); + TEST_BSX(bsrl, "k", 0x00340128); + TEST_BSX(bsfl, "k", 0); + TEST_BSX(bsfl, "k", 0x00340128); +#if defined(__x86_64__) + TEST_BSX(bsrq, "", 0); + TEST_BSX(bsrq, "", 0x003401281234); + TEST_BSX(bsfq, "", 0); + TEST_BSX(bsfq, "", 0x003401281234); +#endif +} + +/**********************************************/ + +union float64u { + double d; + uint64_t l; +}; + +union float64u q_nan = { .l = 0xFFF8000000000000LL }; +union float64u s_nan = { .l = 0xFFF0000000000000LL }; + +void test_fops(double a, double b) +{ + printf("a=%f b=%f a+b=%f\n", a, b, a + b); + printf("a=%f b=%f a-b=%f\n", a, b, a - b); + printf("a=%f b=%f a*b=%f\n", a, b, a * b); + printf("a=%f b=%f a/b=%f\n", a, b, a / b); + printf("a=%f b=%f fmod(a, b)=%f\n", a, b, fmod(a, b)); + printf("a=%f sqrt(a)=%f\n", a, sqrt(a)); + printf("a=%f sin(a)=%f\n", a, sin(a)); + printf("a=%f cos(a)=%f\n", a, cos(a)); + printf("a=%f tan(a)=%f\n", a, tan(a)); + printf("a=%f log(a)=%f\n", a, log(a)); + printf("a=%f exp(a)=%f\n", a, exp(a)); + printf("a=%f b=%f atan2(a, b)=%f\n", a, b, atan2(a, b)); + /* just to test some op combining */ + printf("a=%f asin(sin(a))=%f\n", a, asin(sin(a))); + printf("a=%f acos(cos(a))=%f\n", a, acos(cos(a))); + printf("a=%f atan(tan(a))=%f\n", a, atan(tan(a))); + +} + +void fpu_clear_exceptions(void) +{ + struct __attribute__((packed)) { + uint16_t fpuc; + uint16_t dummy1; + uint16_t 
fpus; + uint16_t dummy2; + uint16_t fptag; + uint16_t dummy3; + uint32_t ignored[4]; + long double fpregs[8]; + } float_env32; + + asm volatile ("fnstenv %0\n" : : "m" (float_env32)); + float_env32.fpus &= ~0x7f; + asm volatile ("fldenv %0\n" : : "m" (float_env32)); +} + +/* XXX: display exception bits when supported */ +#define FPUS_EMASK 0x0000 +//#define FPUS_EMASK 0x007f + +void test_fcmp(double a, double b) +{ + long eflags, fpus; + + fpu_clear_exceptions(); + asm("fcom %2\n" + "fstsw %%ax\n" + : "=a" (fpus) + : "t" (a), "u" (b)); + printf("fcom(%f %f)=%04lx \n", + a, b, fpus & (0x4500 | FPUS_EMASK)); + fpu_clear_exceptions(); + asm("fucom %2\n" + "fstsw %%ax\n" + : "=a" (fpus) + : "t" (a), "u" (b)); + printf("fucom(%f %f)=%04lx\n", + a, b, fpus & (0x4500 | FPUS_EMASK)); + if (TEST_FCOMI) { + /* test f(u)comi instruction */ + fpu_clear_exceptions(); + asm("fcomi %3, %2\n" + "fstsw %%ax\n" + "pushf\n" + "pop %0\n" + : "=r" (eflags), "=a" (fpus) + : "t" (a), "u" (b)); + printf("fcomi(%f %f)=%04lx %02lx\n", + a, b, fpus & FPUS_EMASK, eflags & (CC_Z | CC_P | CC_C)); + fpu_clear_exceptions(); + asm("fucomi %3, %2\n" + "fstsw %%ax\n" + "pushf\n" + "pop %0\n" + : "=r" (eflags), "=a" (fpus) + : "t" (a), "u" (b)); + printf("fucomi(%f %f)=%04lx %02lx\n", + a, b, fpus & FPUS_EMASK, eflags & (CC_Z | CC_P | CC_C)); + } + fpu_clear_exceptions(); + asm volatile("fxam\n" + "fstsw %%ax\n" + : "=a" (fpus) + : "t" (a)); + printf("fxam(%f)=%04lx\n", a, fpus & 0x4700); + fpu_clear_exceptions(); +} + +void test_fcvt(double a) +{ + float fa; + long double la; + int16_t fpuc; + int i; + int64_t lla; + int ia; + int16_t wa; + double ra; + + fa = a; + la = a; + printf("(float)%f = %f\n", a, fa); + printf("(long double)%f = %Lf\n", a, la); + printf("a=" FMT64X "\n", *(uint64_t *)&a); + printf("la=" FMT64X " %04x\n", *(uint64_t *)&la, + *(unsigned short *)((char *)(&la) + 8)); + + /* test all roundings */ + asm volatile ("fstcw %0" : "=m" (fpuc)); + for(i=0;i<4;i++) { + uint16_t val16; + val16 = (fpuc & ~0x0c00) | (i << 10); + asm volatile ("fldcw %0" : : "m" (val16)); + asm volatile ("fist %0" : "=m" (wa) : "t" (a)); + asm volatile ("fistl %0" : "=m" (ia) : "t" (a)); + asm volatile ("fistpll %0" : "=m" (lla) : "t" (a) : "st"); + asm volatile ("frndint ; fstl %0" : "=m" (ra) : "t" (a)); + asm volatile ("fldcw %0" : : "m" (fpuc)); + printf("(short)a = %d\n", wa); + printf("(int)a = %d\n", ia); + printf("(int64_t)a = " FMT64X "\n", lla); + printf("rint(a) = %f\n", ra); + } +} + +#define TEST(N) \ + asm("fld" #N : "=t" (a)); \ + printf("fld" #N "= %f\n", a); + +void test_fconst(void) +{ + double a; + TEST(1); + TEST(l2t); + TEST(l2e); + TEST(pi); + TEST(lg2); + TEST(ln2); + TEST(z); +} + +void test_fbcd(double a) +{ + unsigned short bcd[5]; + double b; + + asm("fbstp %0" : "=m" (bcd[0]) : "t" (a) : "st"); + asm("fbld %1" : "=t" (b) : "m" (bcd[0])); + printf("a=%f bcd=%04x%04x%04x%04x%04x b=%f\n", + a, bcd[4], bcd[3], bcd[2], bcd[1], bcd[0], b); +} + +#define TEST_ENV(env, save, restore)\ +{\ + memset((env), 0xaa, sizeof(*(env)));\ + for(i=0;i<5;i++)\ + asm volatile ("fldl %0" : : "m" (dtab[i]));\ + asm volatile (save " %0\n" : : "m" (*(env)));\ + asm volatile (restore " %0\n": : "m" (*(env)));\ + for(i=0;i<5;i++)\ + asm volatile ("fstpl %0" : "=m" (rtab[i]));\ + for(i=0;i<5;i++)\ + printf("res[%d]=%f\n", i, rtab[i]);\ + printf("fpuc=%04x fpus=%04x fptag=%04x\n",\ + (env)->fpuc,\ + (env)->fpus & 0xff00,\ + (env)->fptag);\ +} + +void test_fenv(void) +{ + struct __attribute__((packed)) { + uint16_t fpuc; + uint16_t 
dummy1; + uint16_t fpus; + uint16_t dummy2; + uint16_t fptag; + uint16_t dummy3; + uint32_t ignored[4]; + long double fpregs[8]; + } float_env32; + struct __attribute__((packed)) { + uint16_t fpuc; + uint16_t fpus; + uint16_t fptag; + uint16_t ignored[4]; + long double fpregs[8]; + } float_env16; + double dtab[8]; + double rtab[8]; + int i; + + for(i=0;i<8;i++) + dtab[i] = i + 1; + + TEST_ENV(&float_env16, "data16 fnstenv", "data16 fldenv"); + TEST_ENV(&float_env16, "data16 fnsave", "data16 frstor"); + TEST_ENV(&float_env32, "fnstenv", "fldenv"); + TEST_ENV(&float_env32, "fnsave", "frstor"); + + /* test for ffree */ + for(i=0;i<5;i++) + asm volatile ("fldl %0" : : "m" (dtab[i])); + asm volatile("ffree %st(2)"); + asm volatile ("fnstenv %0\n" : : "m" (float_env32)); + asm volatile ("fninit"); + printf("fptag=%04x\n", float_env32.fptag); +} + + +#define TEST_FCMOV(a, b, eflags, CC)\ +{\ + double res;\ + asm("push %3\n"\ + "popf\n"\ + "fcmov" CC " %2, %0\n"\ + : "=t" (res)\ + : "0" (a), "u" (b), "g" (eflags));\ + printf("fcmov%s eflags=0x%04lx-> %f\n", \ + CC, (long)eflags, res);\ +} + +void test_fcmov(void) +{ + double a, b; + long eflags, i; + + a = 1.0; + b = 2.0; + for(i = 0; i < 4; i++) { + eflags = 0; + if (i & 1) + eflags |= CC_C; + if (i & 2) + eflags |= CC_Z; + TEST_FCMOV(a, b, eflags, "b"); + TEST_FCMOV(a, b, eflags, "e"); + TEST_FCMOV(a, b, eflags, "be"); + TEST_FCMOV(a, b, eflags, "nb"); + TEST_FCMOV(a, b, eflags, "ne"); + TEST_FCMOV(a, b, eflags, "nbe"); + } + TEST_FCMOV(a, b, 0, "u"); + TEST_FCMOV(a, b, CC_P, "u"); + TEST_FCMOV(a, b, 0, "nu"); + TEST_FCMOV(a, b, CC_P, "nu"); +} + +void test_floats(void) +{ + test_fops(2, 3); + test_fops(1.4, -5); + test_fcmp(2, -1); + test_fcmp(2, 2); + test_fcmp(2, 3); + test_fcmp(2, q_nan.d); + test_fcmp(q_nan.d, -1); + test_fcmp(-1.0/0.0, -1); + test_fcmp(1.0/0.0, -1); + test_fcvt(0.5); + test_fcvt(-0.5); + test_fcvt(1.0/7.0); + test_fcvt(-1.0/9.0); + test_fcvt(32768); + test_fcvt(-1e20); + test_fcvt(-1.0/0.0); + test_fcvt(1.0/0.0); + test_fcvt(q_nan.d); + test_fconst(); + test_fbcd(1234567890123456.0); + test_fbcd(-123451234567890.0); + test_fenv(); + if (TEST_CMOV) { + test_fcmov(); + } +} + +/**********************************************/ +#if !defined(__x86_64__) + +#define TEST_BCD(op, op0, cc_in, cc_mask)\ +{\ + int res, flags;\ + res = op0;\ + flags = cc_in;\ + asm ("push %3\n\t"\ + "popf\n\t"\ + #op "\n\t"\ + "pushf\n\t"\ + "pop %1\n\t"\ + : "=a" (res), "=g" (flags)\ + : "0" (res), "1" (flags));\ + printf("%-10s A=%08x R=%08x CCIN=%04x CC=%04x\n",\ + #op, op0, res, cc_in, flags & cc_mask);\ +} + +void test_bcd(void) +{ + TEST_BCD(daa, 0x12340503, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340506, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340507, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340559, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340560, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x1234059f, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x123405a0, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340503, 0, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340506, 0, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340503, CC_C, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340506, CC_C, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340503, CC_C | CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(daa, 0x12340506, CC_C | CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + 
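+    /* das: decimal adjust AL after subtraction */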
+ TEST_BCD(das, 0x12340503, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340506, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340507, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340559, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340560, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x1234059f, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x123405a0, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340503, 0, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340506, 0, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340503, CC_C, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340506, CC_C, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340503, CC_C | CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + TEST_BCD(das, 0x12340506, CC_C | CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_A)); + + TEST_BCD(aaa, 0x12340205, CC_A, (CC_C | CC_A)); + TEST_BCD(aaa, 0x12340306, CC_A, (CC_C | CC_A)); + TEST_BCD(aaa, 0x1234040a, CC_A, (CC_C | CC_A)); + TEST_BCD(aaa, 0x123405fa, CC_A, (CC_C | CC_A)); + TEST_BCD(aaa, 0x12340205, 0, (CC_C | CC_A)); + TEST_BCD(aaa, 0x12340306, 0, (CC_C | CC_A)); + TEST_BCD(aaa, 0x1234040a, 0, (CC_C | CC_A)); + TEST_BCD(aaa, 0x123405fa, 0, (CC_C | CC_A)); + + TEST_BCD(aas, 0x12340205, CC_A, (CC_C | CC_A)); + TEST_BCD(aas, 0x12340306, CC_A, (CC_C | CC_A)); + TEST_BCD(aas, 0x1234040a, CC_A, (CC_C | CC_A)); + TEST_BCD(aas, 0x123405fa, CC_A, (CC_C | CC_A)); + TEST_BCD(aas, 0x12340205, 0, (CC_C | CC_A)); + TEST_BCD(aas, 0x12340306, 0, (CC_C | CC_A)); + TEST_BCD(aas, 0x1234040a, 0, (CC_C | CC_A)); + TEST_BCD(aas, 0x123405fa, 0, (CC_C | CC_A)); + + TEST_BCD(aam, 0x12340547, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_O | CC_A)); + TEST_BCD(aad, 0x12340407, CC_A, (CC_C | CC_P | CC_Z | CC_S | CC_O | CC_A)); +} +#endif + +#define TEST_XCHG(op, size, opconst)\ +{\ + long op0, op1;\ + op0 = i2l(0x12345678);\ + op1 = i2l(0xfbca7654);\ + asm(#op " %" size "0, %" size "1" \ + : "=q" (op0), opconst (op1) \ + : "0" (op0));\ + printf("%-10s A=" FMTLX " B=" FMTLX "\n",\ + #op, op0, op1);\ +} + +#define TEST_CMPXCHG(op, size, opconst, eax)\ +{\ + long op0, op1, op2;\ + op0 = i2l(0x12345678);\ + op1 = i2l(0xfbca7654);\ + op2 = i2l(eax);\ + asm(#op " %" size "0, %" size "1" \ + : "=q" (op0), opconst (op1) \ + : "0" (op0), "a" (op2));\ + printf("%-10s EAX=" FMTLX " A=" FMTLX " C=" FMTLX "\n",\ + #op, op2, op0, op1);\ +} + +void test_xchg(void) +{ +#if defined(__x86_64__) + TEST_XCHG(xchgq, "", "+q"); +#endif + TEST_XCHG(xchgl, "k", "+q"); + TEST_XCHG(xchgw, "w", "+q"); + TEST_XCHG(xchgb, "b", "+q"); + +#if defined(__x86_64__) + TEST_XCHG(xchgq, "", "=m"); +#endif + TEST_XCHG(xchgl, "k", "+m"); + TEST_XCHG(xchgw, "w", "+m"); + TEST_XCHG(xchgb, "b", "+m"); + +#if defined(__x86_64__) + TEST_XCHG(xaddq, "", "+q"); +#endif + TEST_XCHG(xaddl, "k", "+q"); + TEST_XCHG(xaddw, "w", "+q"); + TEST_XCHG(xaddb, "b", "+q"); + + { + int res; + res = 0x12345678; + asm("xaddl %1, %0" : "=r" (res) : "0" (res)); + printf("xaddl same res=%08x\n", res); + } + +#if defined(__x86_64__) + TEST_XCHG(xaddq, "", "+m"); +#endif + TEST_XCHG(xaddl, "k", "+m"); + TEST_XCHG(xaddw, "w", "+m"); + TEST_XCHG(xaddb, "b", "+m"); + +#if defined(__x86_64__) + TEST_CMPXCHG(cmpxchgq, "", "+q", 0xfbca7654); +#endif + TEST_CMPXCHG(cmpxchgl, "k", "+q", 0xfbca7654); + TEST_CMPXCHG(cmpxchgw, "w", "+q", 0xfbca7654); + TEST_CMPXCHG(cmpxchgb, "b", "+q", 0xfbca7654); + +#if defined(__x86_64__) + TEST_CMPXCHG(cmpxchgq, "", 
"+q", 0xfffefdfc); +#endif + TEST_CMPXCHG(cmpxchgl, "k", "+q", 0xfffefdfc); + TEST_CMPXCHG(cmpxchgw, "w", "+q", 0xfffefdfc); + TEST_CMPXCHG(cmpxchgb, "b", "+q", 0xfffefdfc); + +#if defined(__x86_64__) + TEST_CMPXCHG(cmpxchgq, "", "+m", 0xfbca7654); +#endif + TEST_CMPXCHG(cmpxchgl, "k", "+m", 0xfbca7654); + TEST_CMPXCHG(cmpxchgw, "w", "+m", 0xfbca7654); + TEST_CMPXCHG(cmpxchgb, "b", "+m", 0xfbca7654); + +#if defined(__x86_64__) + TEST_CMPXCHG(cmpxchgq, "", "+m", 0xfffefdfc); +#endif + TEST_CMPXCHG(cmpxchgl, "k", "+m", 0xfffefdfc); + TEST_CMPXCHG(cmpxchgw, "w", "+m", 0xfffefdfc); + TEST_CMPXCHG(cmpxchgb, "b", "+m", 0xfffefdfc); + + { + uint64_t op0, op1, op2; + long eax, edx; + long i, eflags; + + for(i = 0; i < 2; i++) { + op0 = 0x123456789abcdLL; + eax = i2l(op0 & 0xffffffff); + edx = i2l(op0 >> 32); + if (i == 0) + op1 = 0xfbca765423456LL; + else + op1 = op0; + op2 = 0x6532432432434LL; + asm("cmpxchg8b %2\n" + "pushf\n" + "pop %3\n" + : "=a" (eax), "=d" (edx), "=m" (op1), "=g" (eflags) + : "0" (eax), "1" (edx), "m" (op1), "b" ((int)op2), "c" ((int)(op2 >> 32))); + printf("cmpxchg8b: eax=" FMTLX " edx=" FMTLX " op1=" FMT64X " CC=%02lx\n", + eax, edx, op1, eflags & CC_Z); + } + } +} + +#ifdef TEST_SEGS +/**********************************************/ +/* segmentation tests */ + +#include <sys/syscall.h> +#include <unistd.h> +#include <asm/ldt.h> +#include <linux/version.h> + +static inline int modify_ldt(int func, void * ptr, unsigned long bytecount) +{ + return syscall(__NR_modify_ldt, func, ptr, bytecount); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 66) +#define modify_ldt_ldt_s user_desc +#endif + +#define MK_SEL(n) (((n) << 3) | 7) + +uint8_t seg_data1[4096]; +uint8_t seg_data2[4096]; + +#define TEST_LR(op, size, seg, mask)\ +{\ + int res, res2;\ + uint16_t mseg = seg;\ + res = 0x12345678;\ + asm (op " %" size "2, %" size "0\n" \ + "movl $0, %1\n"\ + "jnz 1f\n"\ + "movl $1, %1\n"\ + "1:\n"\ + : "=r" (res), "=r" (res2) : "m" (mseg), "0" (res));\ + printf(op ": Z=%d %08x\n", res2, res & ~(mask));\ +} + +#define TEST_ARPL(op, size, op1, op2)\ +{\ + long a, b, c; \ + a = (op1); \ + b = (op2); \ + asm volatile(op " %" size "3, %" size "0\n"\ + "movl $0,%1\n"\ + "jnz 1f\n"\ + "movl $1,%1\n"\ + "1:\n"\ + : "=r" (a), "=r" (c) : "0" (a), "r" (b)); \ + printf(op size " A=" FMTLX " B=" FMTLX " R=" FMTLX " z=%ld\n",\ + (long)(op1), (long)(op2), a, c);\ +} + +/* NOTE: we use Linux modify_ldt syscall */ +void test_segs(void) +{ + struct modify_ldt_ldt_s ldt; + long long ldt_table[3]; + int res, res2; + char tmp; + struct { + uint32_t offset; + uint16_t seg; + } __attribute__((packed)) segoff; + + ldt.entry_number = 1; + ldt.base_addr = (unsigned long)&seg_data1; + ldt.limit = (sizeof(seg_data1) + 0xfff) >> 12; + ldt.seg_32bit = 1; + ldt.contents = MODIFY_LDT_CONTENTS_DATA; + ldt.read_exec_only = 0; + ldt.limit_in_pages = 1; + ldt.seg_not_present = 0; + ldt.useable = 1; + modify_ldt(1, &ldt, sizeof(ldt)); /* write ldt entry */ + + ldt.entry_number = 2; + ldt.base_addr = (unsigned long)&seg_data2; + ldt.limit = (sizeof(seg_data2) + 0xfff) >> 12; + ldt.seg_32bit = 1; + ldt.contents = MODIFY_LDT_CONTENTS_DATA; + ldt.read_exec_only = 0; + ldt.limit_in_pages = 1; + ldt.seg_not_present = 0; + ldt.useable = 1; + modify_ldt(1, &ldt, sizeof(ldt)); /* write ldt entry */ + + modify_ldt(0, &ldt_table, sizeof(ldt_table)); /* read ldt entries */ +#if 0 + { + int i; + for(i=0;i<3;i++) + printf("%d: %016Lx\n", i, ldt_table[i]); + } +#endif + /* do some tests with fs or gs */ + asm volatile ("movl %0, %%fs" 
: : "r" (MK_SEL(1))); + + seg_data1[1] = 0xaa; + seg_data2[1] = 0x55; + + asm volatile ("fs movzbl 0x1, %0" : "=r" (res)); + printf("FS[1] = %02x\n", res); + + asm volatile ("pushl %%gs\n" + "movl %1, %%gs\n" + "gs movzbl 0x1, %0\n" + "popl %%gs\n" + : "=r" (res) + : "r" (MK_SEL(2))); + printf("GS[1] = %02x\n", res); + + /* tests with ds/ss (implicit segment case) */ + tmp = 0xa5; + asm volatile ("pushl %%ebp\n\t" + "pushl %%ds\n\t" + "movl %2, %%ds\n\t" + "movl %3, %%ebp\n\t" + "movzbl 0x1, %0\n\t" + "movzbl (%%ebp), %1\n\t" + "popl %%ds\n\t" + "popl %%ebp\n\t" + : "=r" (res), "=r" (res2) + : "r" (MK_SEL(1)), "r" (&tmp)); + printf("DS[1] = %02x\n", res); + printf("SS[tmp] = %02x\n", res2); + + segoff.seg = MK_SEL(2); + segoff.offset = 0xabcdef12; + asm volatile("lfs %2, %0\n\t" + "movl %%fs, %1\n\t" + : "=r" (res), "=g" (res2) + : "m" (segoff)); + printf("FS:reg = %04x:%08x\n", res2, res); + + TEST_LR("larw", "w", MK_SEL(2), 0x0100); + TEST_LR("larl", "", MK_SEL(2), 0x0100); + TEST_LR("lslw", "w", MK_SEL(2), 0); + TEST_LR("lsll", "", MK_SEL(2), 0); + + TEST_LR("larw", "w", 0xfff8, 0); + TEST_LR("larl", "", 0xfff8, 0); + TEST_LR("lslw", "w", 0xfff8, 0); + TEST_LR("lsll", "", 0xfff8, 0); + + TEST_ARPL("arpl", "w", 0x12345678 | 3, 0x762123c | 1); + TEST_ARPL("arpl", "w", 0x12345678 | 1, 0x762123c | 3); + TEST_ARPL("arpl", "w", 0x12345678 | 1, 0x762123c | 1); +} + +/* 16 bit code test */ +extern char code16_start, code16_end; +extern char code16_func1; +extern char code16_func2; +extern char code16_func3; + +void test_code16(void) +{ + struct modify_ldt_ldt_s ldt; + int res, res2; + + /* build a code segment */ + ldt.entry_number = 1; + ldt.base_addr = (unsigned long)&code16_start; + ldt.limit = &code16_end - &code16_start; + ldt.seg_32bit = 0; + ldt.contents = MODIFY_LDT_CONTENTS_CODE; + ldt.read_exec_only = 0; + ldt.limit_in_pages = 0; + ldt.seg_not_present = 0; + ldt.useable = 1; + modify_ldt(1, &ldt, sizeof(ldt)); /* write ldt entry */ + + /* call the first function */ + asm volatile ("lcall %1, %2" + : "=a" (res) + : "i" (MK_SEL(1)), "i" (&code16_func1): "memory", "cc"); + printf("func1() = 0x%08x\n", res); + asm volatile ("lcall %2, %3" + : "=a" (res), "=c" (res2) + : "i" (MK_SEL(1)), "i" (&code16_func2): "memory", "cc"); + printf("func2() = 0x%08x spdec=%d\n", res, res2); + asm volatile ("lcall %1, %2" + : "=a" (res) + : "i" (MK_SEL(1)), "i" (&code16_func3): "memory", "cc"); + printf("func3() = 0x%08x\n", res); +} +#endif + +#if defined(__x86_64__) +asm(".globl func_lret\n" + "func_lret:\n" + "movl $0x87654641, %eax\n" + "lretq\n"); +#else +asm(".globl func_lret\n" + "func_lret:\n" + "movl $0x87654321, %eax\n" + "lret\n" + + ".globl func_iret\n" + "func_iret:\n" + "movl $0xabcd4321, %eax\n" + "iret\n"); +#endif + +extern char func_lret; +extern char func_iret; + +void test_misc(void) +{ + char table[256]; + long res, i; + + for(i=0;i<256;i++) table[i] = 256 - i; + res = 0x12345678; + asm ("xlat" : "=a" (res) : "b" (table), "0" (res)); + printf("xlat: EAX=" FMTLX "\n", res); + +#if defined(__x86_64__) +#if 0 + { + /* XXX: see if Intel Core2 and AMD64 behavior really + differ. Here we implemented the Intel way which is not + compatible yet with QEMU. 
*/ + static struct __attribute__((packed)) { + uint64_t offset; + uint16_t seg; + } desc; + long cs_sel; + + asm volatile ("mov %%cs, %0" : "=r" (cs_sel)); + + asm volatile ("push %1\n" + "call func_lret\n" + : "=a" (res) + : "r" (cs_sel) : "memory", "cc"); + printf("func_lret=" FMTLX "\n", res); + + desc.offset = (long)&func_lret; + desc.seg = cs_sel; + + asm volatile ("xor %%rax, %%rax\n" + "rex64 lcall *(%%rcx)\n" + : "=a" (res) + : "c" (&desc) + : "memory", "cc"); + printf("func_lret2=" FMTLX "\n", res); + + asm volatile ("push %2\n" + "mov $ 1f, %%rax\n" + "push %%rax\n" + "rex64 ljmp *(%%rcx)\n" + "1:\n" + : "=a" (res) + : "c" (&desc), "b" (cs_sel) + : "memory", "cc"); + printf("func_lret3=" FMTLX "\n", res); + } +#endif +#else + asm volatile ("push %%cs ; call %1" + : "=a" (res) + : "m" (func_lret): "memory", "cc"); + printf("func_lret=" FMTLX "\n", res); + + asm volatile ("pushf ; push %%cs ; call %1" + : "=a" (res) + : "m" (func_iret): "memory", "cc"); + printf("func_iret=" FMTLX "\n", res); +#endif + +#if defined(__x86_64__) + /* specific popl test */ + asm volatile ("push $12345432 ; push $0x9abcdef ; pop (%%rsp) ; pop %0" + : "=g" (res)); + printf("popl esp=" FMTLX "\n", res); +#else + /* specific popl test */ + asm volatile ("pushl $12345432 ; pushl $0x9abcdef ; popl (%%esp) ; popl %0" + : "=g" (res)); + printf("popl esp=" FMTLX "\n", res); + + /* specific popw test */ + asm volatile ("pushl $12345432 ; pushl $0x9abcdef ; popw (%%esp) ; addl $2, %%esp ; popl %0" + : "=g" (res)); + printf("popw esp=" FMTLX "\n", res); +#endif +} + +uint8_t str_buffer[4096]; + +#define TEST_STRING1(OP, size, DF, REP)\ +{\ + long esi, edi, eax, ecx, eflags;\ +\ + esi = (long)(str_buffer + sizeof(str_buffer) / 2);\ + edi = (long)(str_buffer + sizeof(str_buffer) / 2) + 16;\ + eax = i2l(0x12345678);\ + ecx = 17;\ +\ + asm volatile ("push $0\n\t"\ + "popf\n\t"\ + DF "\n\t"\ + REP #OP size "\n\t"\ + "cld\n\t"\ + "pushf\n\t"\ + "pop %4\n\t"\ + : "=S" (esi), "=D" (edi), "=a" (eax), "=c" (ecx), "=g" (eflags)\ + : "0" (esi), "1" (edi), "2" (eax), "3" (ecx));\ + printf("%-10s ESI=" FMTLX " EDI=" FMTLX " EAX=" FMTLX " ECX=" FMTLX " EFL=%04x\n",\ + REP #OP size, esi, edi, eax, ecx,\ + (int)(eflags & (CC_C | CC_P | CC_Z | CC_S | CC_O | CC_A)));\ +} + +#define TEST_STRING(OP, REP)\ + TEST_STRING1(OP, "b", "", REP);\ + TEST_STRING1(OP, "w", "", REP);\ + TEST_STRING1(OP, "l", "", REP);\ + X86_64_ONLY(TEST_STRING1(OP, "q", "", REP));\ + TEST_STRING1(OP, "b", "std", REP);\ + TEST_STRING1(OP, "w", "std", REP);\ + TEST_STRING1(OP, "l", "std", REP);\ + X86_64_ONLY(TEST_STRING1(OP, "q", "std", REP)) + +void test_string(void) +{ + int i; + for(i = 0;i < sizeof(str_buffer); i++) + str_buffer[i] = i + 0x56; + TEST_STRING(stos, ""); + TEST_STRING(stos, "rep "); + TEST_STRING(lods, ""); /* to verify stos */ + TEST_STRING(lods, "rep "); + TEST_STRING(movs, ""); + TEST_STRING(movs, "rep "); + TEST_STRING(lods, ""); /* to verify stos */ + + /* XXX: better tests */ + TEST_STRING(scas, ""); + TEST_STRING(scas, "repz "); + TEST_STRING(scas, "repnz "); + TEST_STRING(cmps, ""); + TEST_STRING(cmps, "repz "); + TEST_STRING(cmps, "repnz "); +} + +#ifdef TEST_VM86 +/* VM86 test */ + +static inline void set_bit(uint8_t *a, unsigned int bit) +{ + a[bit / 8] |= (1 << (bit % 8)); +} + +static inline uint8_t *seg_to_linear(unsigned int seg, unsigned int reg) +{ + return (uint8_t *)((seg << 4) + (reg & 0xffff)); +} + +static inline void pushw(struct vm86_regs *r, int val) +{ + r->esp = (r->esp & ~0xffff) | ((r->esp - 2) & 0xffff); + 
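+    /* Only the low 16 bits of ESP wrap around, as on a real 16-bit stack; the
+       store below then goes through SS:SP, e.g. ss=0x100 and sp=0xfffc give
+       the linear address (0x100 << 4) + 0xfffc = 0x10ffc. */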
*(uint16_t *)seg_to_linear(r->ss, r->esp) = val; +} + +static inline int vm86(int func, struct vm86plus_struct *v86) +{ + return syscall(__NR_vm86, func, v86); +} + +extern char vm86_code_start; +extern char vm86_code_end; + +#define VM86_CODE_CS 0x100 +#define VM86_CODE_IP 0x100 + +void test_vm86(void) +{ + struct vm86plus_struct ctx; + struct vm86_regs *r; + uint8_t *vm86_mem; + int seg, ret; + + vm86_mem = mmap((void *)0x00000000, 0x110000, + PROT_WRITE | PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + if (vm86_mem == MAP_FAILED) { + printf("ERROR: could not map vm86 memory"); + return; + } + memset(&ctx, 0, sizeof(ctx)); + + /* init basic registers */ + r = &ctx.regs; + r->eip = VM86_CODE_IP; + r->esp = 0xfffe; + seg = VM86_CODE_CS; + r->cs = seg; + r->ss = seg; + r->ds = seg; + r->es = seg; + r->fs = seg; + r->gs = seg; + r->eflags = VIF_MASK; + + /* move code to proper address. We use the same layout as a .com + dos program. */ + memcpy(vm86_mem + (VM86_CODE_CS << 4) + VM86_CODE_IP, + &vm86_code_start, &vm86_code_end - &vm86_code_start); + + /* mark int 0x21 as being emulated */ + set_bit((uint8_t *)&ctx.int_revectored, 0x21); + + for(;;) { + ret = vm86(VM86_ENTER, &ctx); + switch(VM86_TYPE(ret)) { + case VM86_INTx: + { + int int_num, ah, v; + + int_num = VM86_ARG(ret); + if (int_num != 0x21) + goto unknown_int; + ah = (r->eax >> 8) & 0xff; + switch(ah) { + case 0x00: /* exit */ + goto the_end; + case 0x02: /* write char */ + { + uint8_t c = r->edx; + putchar(c); + } + break; + case 0x09: /* write string */ + { + uint8_t c, *ptr; + ptr = seg_to_linear(r->ds, r->edx); + for(;;) { + c = *ptr++; + if (c == '$') + break; + putchar(c); + } + r->eax = (r->eax & ~0xff) | '$'; + } + break; + case 0xff: /* extension: write eflags number in edx */ + v = (int)r->edx; +#ifndef LINUX_VM86_IOPL_FIX + v &= ~0x3000; +#endif + printf("%08x\n", v); + break; + default: + unknown_int: + printf("unsupported int 0x%02x\n", int_num); + goto the_end; + } + } + break; + case VM86_SIGNAL: + /* a signal came, we just ignore that */ + break; + case VM86_STI: + break; + default: + printf("ERROR: unhandled vm86 return code (0x%x)\n", ret); + goto the_end; + } + } + the_end: + printf("VM86 end\n"); + munmap(vm86_mem, 0x110000); +} +#endif + +/* exception tests */ +#if defined(__i386__) && !defined(REG_EAX) +#define REG_EAX EAX +#define REG_EBX EBX +#define REG_ECX ECX +#define REG_EDX EDX +#define REG_ESI ESI +#define REG_EDI EDI +#define REG_EBP EBP +#define REG_ESP ESP +#define REG_EIP EIP +#define REG_EFL EFL +#define REG_TRAPNO TRAPNO +#define REG_ERR ERR +#endif + +#if defined(__x86_64__) +#define REG_EIP REG_RIP +#endif + +jmp_buf jmp_env; +int v1; +int tab[2]; + +void sig_handler(int sig, siginfo_t *info, void *puc) +{ + struct ucontext *uc = puc; + + printf("si_signo=%d si_errno=%d si_code=%d", + info->si_signo, info->si_errno, info->si_code); + printf(" si_addr=0x%08lx", + (unsigned long)info->si_addr); + printf("\n"); + + printf("trapno=" FMTLX " err=" FMTLX, + (long)uc->uc_mcontext.gregs[REG_TRAPNO], + (long)uc->uc_mcontext.gregs[REG_ERR]); + printf(" EIP=" FMTLX, (long)uc->uc_mcontext.gregs[REG_EIP]); + printf("\n"); + longjmp(jmp_env, 1); +} + +void test_exceptions(void) +{ + struct sigaction act; + volatile int val; + + act.sa_sigaction = sig_handler; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_SIGINFO | SA_NODEFER; + sigaction(SIGFPE, &act, NULL); + sigaction(SIGILL, &act, NULL); + sigaction(SIGSEGV, &act, NULL); + sigaction(SIGBUS, &act, NULL); + sigaction(SIGTRAP, &act, 
NULL); + + /* test division by zero reporting */ + printf("DIVZ exception:\n"); + if (setjmp(jmp_env) == 0) { + /* now divide by zero */ + v1 = 0; + v1 = 2 / v1; + } + +#if !defined(__x86_64__) + printf("BOUND exception:\n"); + if (setjmp(jmp_env) == 0) { + /* bound exception */ + tab[0] = 1; + tab[1] = 10; + asm volatile ("bound %0, %1" : : "r" (11), "m" (tab[0])); + } +#endif + +#ifdef TEST_SEGS + printf("segment exceptions:\n"); + if (setjmp(jmp_env) == 0) { + /* load an invalid segment */ + asm volatile ("movl %0, %%fs" : : "r" ((0x1234 << 3) | 1)); + } + if (setjmp(jmp_env) == 0) { + /* null data segment is valid */ + asm volatile ("movl %0, %%fs" : : "r" (3)); + /* null stack segment */ + asm volatile ("movl %0, %%ss" : : "r" (3)); + } + + { + struct modify_ldt_ldt_s ldt; + ldt.entry_number = 1; + ldt.base_addr = (unsigned long)&seg_data1; + ldt.limit = (sizeof(seg_data1) + 0xfff) >> 12; + ldt.seg_32bit = 1; + ldt.contents = MODIFY_LDT_CONTENTS_DATA; + ldt.read_exec_only = 0; + ldt.limit_in_pages = 1; + ldt.seg_not_present = 1; + ldt.useable = 1; + modify_ldt(1, &ldt, sizeof(ldt)); /* write ldt entry */ + + if (setjmp(jmp_env) == 0) { + /* segment not present */ + asm volatile ("movl %0, %%fs" : : "r" (MK_SEL(1))); + } + } +#endif + + /* test SEGV reporting */ + printf("PF exception:\n"); + if (setjmp(jmp_env) == 0) { + val = 1; + /* we add a nop to test a weird PC retrieval case */ + asm volatile ("nop"); + /* now store in an invalid address */ + *(char *)0x1234 = 1; + } + + /* test SEGV reporting */ + printf("PF exception:\n"); + if (setjmp(jmp_env) == 0) { + val = 1; + /* read from an invalid address */ + v1 = *(char *)0x1234; + } + + /* test illegal instruction reporting */ + printf("UD2 exception:\n"); + if (setjmp(jmp_env) == 0) { + /* now execute an invalid instruction */ + asm volatile("ud2"); + } + printf("lock nop exception:\n"); + if (setjmp(jmp_env) == 0) { + /* now execute an invalid instruction */ + asm volatile("lock nop"); + } + + printf("INT exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("int $0xfd"); + } + if (setjmp(jmp_env) == 0) { + asm volatile ("int $0x01"); + } + if (setjmp(jmp_env) == 0) { + asm volatile (".byte 0xcd, 0x03"); + } + if (setjmp(jmp_env) == 0) { + asm volatile ("int $0x04"); + } + if (setjmp(jmp_env) == 0) { + asm volatile ("int $0x05"); + } + + printf("INT3 exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("int3"); + } + + printf("CLI exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("cli"); + } + + printf("STI exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("cli"); + } + +#if !defined(__x86_64__) + printf("INTO exception:\n"); + if (setjmp(jmp_env) == 0) { + /* overflow exception */ + asm volatile ("addl $1, %0 ; into" : : "r" (0x7fffffff)); + } +#endif + + printf("OUTB exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("outb %%al, %%dx" : : "d" (0x4321), "a" (0)); + } + + printf("INB exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("inb %%dx, %%al" : "=a" (val) : "d" (0x4321)); + } + + printf("REP OUTSB exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("rep outsb" : : "d" (0x4321), "S" (tab), "c" (1)); + } + + printf("REP INSB exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("rep insb" : : "d" (0x4321), "D" (tab), "c" (1)); + } + + printf("HLT exception:\n"); + if (setjmp(jmp_env) == 0) { + asm volatile ("hlt"); + } + + printf("single step exception:\n"); + val = 0; + if (setjmp(jmp_env) == 0) { + asm volatile ("pushf\n" + "orl $0x00100, 
(%%esp)\n" + "popf\n" + "movl $0xabcd, %0\n" + "movl $0x0, %0\n" : "=m" (val) : : "cc", "memory"); + } + printf("val=0x%x\n", val); +} + +#if !defined(__x86_64__) +/* specific precise single step test */ +void sig_trap_handler(int sig, siginfo_t *info, void *puc) +{ + struct ucontext *uc = puc; + printf("EIP=" FMTLX "\n", (long)uc->uc_mcontext.gregs[REG_EIP]); +} + +const uint8_t sstep_buf1[4] = { 1, 2, 3, 4}; +uint8_t sstep_buf2[4]; + +void test_single_step(void) +{ + struct sigaction act; + volatile int val; + int i; + + val = 0; + act.sa_sigaction = sig_trap_handler; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_SIGINFO; + sigaction(SIGTRAP, &act, NULL); + asm volatile ("pushf\n" + "orl $0x00100, (%%esp)\n" + "popf\n" + "movl $0xabcd, %0\n" + + /* jmp test */ + "movl $3, %%ecx\n" + "1:\n" + "addl $1, %0\n" + "decl %%ecx\n" + "jnz 1b\n" + + /* movsb: the single step should stop at each movsb iteration */ + "movl $sstep_buf1, %%esi\n" + "movl $sstep_buf2, %%edi\n" + "movl $0, %%ecx\n" + "rep movsb\n" + "movl $3, %%ecx\n" + "rep movsb\n" + "movl $1, %%ecx\n" + "rep movsb\n" + + /* cmpsb: the single step should stop at each cmpsb iteration */ + "movl $sstep_buf1, %%esi\n" + "movl $sstep_buf2, %%edi\n" + "movl $0, %%ecx\n" + "rep cmpsb\n" + "movl $4, %%ecx\n" + "rep cmpsb\n" + + /* getpid() syscall: single step should skip one + instruction */ + "movl $20, %%eax\n" + "int $0x80\n" + "movl $0, %%eax\n" + + /* when modifying SS, trace is not done on the next + instruction */ + "movl %%ss, %%ecx\n" + "movl %%ecx, %%ss\n" + "addl $1, %0\n" + "movl $1, %%eax\n" + "movl %%ecx, %%ss\n" + "jmp 1f\n" + "addl $1, %0\n" + "1:\n" + "movl $1, %%eax\n" + "pushl %%ecx\n" + "popl %%ss\n" + "addl $1, %0\n" + "movl $1, %%eax\n" + + "pushf\n" + "andl $~0x00100, (%%esp)\n" + "popf\n" + : "=m" (val) + : + : "cc", "memory", "eax", "ecx", "esi", "edi"); + printf("val=%d\n", val); + for(i = 0; i < 4; i++) + printf("sstep_buf2[%d] = %d\n", i, sstep_buf2[i]); +} + +/* self modifying code test */ +uint8_t code[] = { + 0xb8, 0x1, 0x00, 0x00, 0x00, /* movl $1, %eax */ + 0xc3, /* ret */ +}; + +asm(".section \".data\"\n" + "smc_code2:\n" + "movl 4(%esp), %eax\n" + "movl %eax, smc_patch_addr2 + 1\n" + "nop\n" + "nop\n" + "nop\n" + "nop\n" + "nop\n" + "nop\n" + "nop\n" + "nop\n" + "smc_patch_addr2:\n" + "movl $1, %eax\n" + "ret\n" + ".previous\n" + ); + +typedef int FuncType(void); +extern int smc_code2(int); +void test_self_modifying_code(void) +{ + int i; + printf("self modifying code:\n"); + printf("func1 = 0x%x\n", ((FuncType *)code)()); + for(i = 2; i <= 4; i++) { + code[1] = i; + printf("func%d = 0x%x\n", i, ((FuncType *)code)()); + } + + /* more difficult test : the modified code is just after the + modifying instruction. 
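+       For the emulator this means the write must invalidate the translation
+       block that is currently executing and force a retranslation before the
+       patched instructions are reached.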
It is forbidden in Intel specs, but it + is used by old DOS programs */ + for(i = 2; i <= 4; i++) { + printf("smc_code2(%d) = %d\n", i, smc_code2(i)); + } +} +#endif + +long enter_stack[4096]; + +#if defined(__x86_64__) +#define RSP "%%rsp" +#define RBP "%%rbp" +#else +#define RSP "%%esp" +#define RBP "%%ebp" +#endif + +#define TEST_ENTER(size, stack_type, level)\ +{\ + long esp_save, esp_val, ebp_val, ebp_save, i;\ + stack_type *ptr, *stack_end, *stack_ptr;\ + memset(enter_stack, 0, sizeof(enter_stack));\ + stack_end = stack_ptr = (stack_type *)(enter_stack + 4096);\ + ebp_val = (long)stack_ptr;\ + for(i=1;i<=32;i++)\ + *--stack_ptr = i;\ + esp_val = (long)stack_ptr;\ + asm("mov " RSP ", %[esp_save]\n"\ + "mov " RBP ", %[ebp_save]\n"\ + "mov %[esp_val], " RSP "\n"\ + "mov %[ebp_val], " RBP "\n"\ + "enter" size " $8, $" #level "\n"\ + "mov " RSP ", %[esp_val]\n"\ + "mov " RBP ", %[ebp_val]\n"\ + "mov %[esp_save], " RSP "\n"\ + "mov %[ebp_save], " RBP "\n"\ + : [esp_save] "=r" (esp_save),\ + [ebp_save] "=r" (ebp_save),\ + [esp_val] "=r" (esp_val),\ + [ebp_val] "=r" (ebp_val)\ + : "[esp_val]" (esp_val),\ + "[ebp_val]" (ebp_val));\ + printf("level=%d:\n", level);\ + printf("esp_val=" FMTLX "\n", esp_val - (long)stack_end);\ + printf("ebp_val=" FMTLX "\n", ebp_val - (long)stack_end);\ + for(ptr = (stack_type *)esp_val; ptr < stack_end; ptr++)\ + printf(FMTLX "\n", (long)ptr[0]);\ +} + +static void test_enter(void) +{ +#if defined(__x86_64__) + TEST_ENTER("q", uint64_t, 0); + TEST_ENTER("q", uint64_t, 1); + TEST_ENTER("q", uint64_t, 2); + TEST_ENTER("q", uint64_t, 31); +#else + TEST_ENTER("l", uint32_t, 0); + TEST_ENTER("l", uint32_t, 1); + TEST_ENTER("l", uint32_t, 2); + TEST_ENTER("l", uint32_t, 31); +#endif + + TEST_ENTER("w", uint16_t, 0); + TEST_ENTER("w", uint16_t, 1); + TEST_ENTER("w", uint16_t, 2); + TEST_ENTER("w", uint16_t, 31); +} + +#ifdef TEST_SSE + +typedef int __m64 __attribute__ ((__mode__ (__V2SI__))); +typedef float __m128 __attribute__ ((__mode__(__V4SF__))); + +typedef union { + double d[2]; + float s[4]; + uint32_t l[4]; + uint64_t q[2]; + __m128 dq; +} XMMReg; + +static uint64_t __attribute__((aligned(16))) test_values[4][2] = { + { 0x456723c698694873, 0xdc515cff944a58ec }, + { 0x1f297ccd58bad7ab, 0x41f21efba9e3e146 }, + { 0x007c62c2085427f8, 0x231be9e8cde7438d }, + { 0x0f76255a085427f8, 0xc233e9e8c4c9439a }, +}; + +#define SSE_OP(op)\ +{\ + asm volatile (#op " %2, %0" : "=x" (r.dq) : "0" (a.dq), "x" (b.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " b=" FMT64X "" FMT64X " r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.q[1], a.q[0],\ + b.q[1], b.q[0],\ + r.q[1], r.q[0]);\ +} + +#define SSE_OP2(op)\ +{\ + int i;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + b.q[0] = test_values[2*i+1][0];\ + b.q[1] = test_values[2*i+1][1];\ + SSE_OP(op);\ + }\ +} + +#define MMX_OP2(op)\ +{\ + int i;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + b.q[0] = test_values[2*i+1][0];\ + asm volatile (#op " %2, %0" : "=y" (r.q[0]) : "0" (a.q[0]), "y" (b.q[0]));\ + printf("%-9s: a=" FMT64X " b=" FMT64X " r=" FMT64X "\n",\ + #op,\ + a.q[0],\ + b.q[0],\ + r.q[0]);\ + }\ + SSE_OP2(op);\ +} + +#define SHUF_OP(op, ib)\ +{\ + a.q[0] = test_values[0][0];\ + a.q[1] = test_values[0][1];\ + b.q[0] = test_values[1][0];\ + b.q[1] = test_values[1][1];\ + asm volatile (#op " $" #ib ", %2, %0" : "=x" (r.dq) : "0" (a.dq), "x" (b.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " b=" FMT64X "" FMT64X " ib=%02x r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.q[1], a.q[0],\ + b.q[1], 
b.q[0],\ + ib,\ + r.q[1], r.q[0]);\ +} + +#define PSHUF_OP(op, ib)\ +{\ + int i;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + asm volatile (#op " $" #ib ", %1, %0" : "=x" (r.dq) : "x" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " ib=%02x r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.q[1], a.q[0],\ + ib,\ + r.q[1], r.q[0]);\ + }\ +} + +#define SHIFT_IM(op, ib)\ +{\ + int i;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + asm volatile (#op " $" #ib ", %0" : "=x" (r.dq) : "0" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " ib=%02x r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.q[1], a.q[0],\ + ib,\ + r.q[1], r.q[0]);\ + }\ +} + +#define SHIFT_OP(op, ib)\ +{\ + int i;\ + SHIFT_IM(op, ib);\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + b.q[0] = ib;\ + b.q[1] = 0;\ + asm volatile (#op " %2, %0" : "=x" (r.dq) : "0" (a.dq), "x" (b.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " b=" FMT64X "" FMT64X " r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.q[1], a.q[0],\ + b.q[1], b.q[0],\ + r.q[1], r.q[0]);\ + }\ +} + +#define MOVMSK(op)\ +{\ + int i, reg;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + asm volatile (#op " %1, %0" : "=r" (reg) : "x" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " r=%08x\n",\ + #op,\ + a.q[1], a.q[0],\ + reg);\ + }\ +} + +#define SSE_OPS(a) \ +SSE_OP(a ## ps);\ +SSE_OP(a ## ss); + +#define SSE_OPD(a) \ +SSE_OP(a ## pd);\ +SSE_OP(a ## sd); + +#define SSE_COMI(op, field)\ +{\ + unsigned int eflags;\ + XMMReg a, b;\ + a.field[0] = a1;\ + b.field[0] = b1;\ + asm volatile (#op " %2, %1\n"\ + "pushf\n"\ + "pop %0\n"\ + : "=m" (eflags)\ + : "x" (a.dq), "x" (b.dq));\ + printf("%-9s: a=%f b=%f cc=%04x\n",\ + #op, a1, b1,\ + eflags & (CC_C | CC_P | CC_Z | CC_S | CC_O | CC_A));\ +} + +void test_sse_comi(double a1, double b1) +{ + SSE_COMI(ucomiss, s); + SSE_COMI(ucomisd, d); + SSE_COMI(comiss, s); + SSE_COMI(comisd, d); +} + +#define CVT_OP_XMM(op)\ +{\ + asm volatile (#op " %1, %0" : "=x" (r.dq) : "x" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.q[1], a.q[0],\ + r.q[1], r.q[0]);\ +} + +/* Force %xmm0 usage to avoid the case where both register index are 0 + to test intruction decoding more extensively */ +#define CVT_OP_XMM2MMX(op)\ +{\ + asm volatile (#op " %1, %0" : "=y" (r.q[0]) : "x" (a.dq) \ + : "%xmm0"); \ + asm volatile("emms\n"); \ + printf("%-9s: a=" FMT64X "" FMT64X " r=" FMT64X "\n",\ + #op,\ + a.q[1], a.q[0],\ + r.q[0]);\ +} + +#define CVT_OP_MMX2XMM(op)\ +{\ + asm volatile (#op " %1, %0" : "=x" (r.dq) : "y" (a.q[0]));\ + asm volatile("emms\n"); \ + printf("%-9s: a=" FMT64X " r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.q[0],\ + r.q[1], r.q[0]);\ +} + +#define CVT_OP_REG2XMM(op)\ +{\ + asm volatile (#op " %1, %0" : "=x" (r.dq) : "r" (a.l[0]));\ + printf("%-9s: a=%08x r=" FMT64X "" FMT64X "\n",\ + #op,\ + a.l[0],\ + r.q[1], r.q[0]);\ +} + +#define CVT_OP_XMM2REG(op)\ +{\ + asm volatile (#op " %1, %0" : "=r" (r.l[0]) : "x" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " r=%08x\n",\ + #op,\ + a.q[1], a.q[0],\ + r.l[0]);\ +} + +struct fpxstate { + uint16_t fpuc; + uint16_t fpus; + uint16_t fptag; + uint16_t fop; + uint32_t fpuip; + uint16_t cs_sel; + uint16_t dummy0; + uint32_t fpudp; + uint16_t ds_sel; + uint16_t dummy1; + uint32_t mxcsr; + uint32_t mxcsr_mask; + uint8_t fpregs1[8 * 16]; + uint8_t xmm_regs[8 * 16]; + uint8_t dummy2[224]; +}; + +static struct fpxstate fpx_state 
__attribute__((aligned(16))); +static struct fpxstate fpx_state2 __attribute__((aligned(16))); + +void test_fxsave(void) +{ + struct fpxstate *fp = &fpx_state; + struct fpxstate *fp2 = &fpx_state2; + int i, nb_xmm; + XMMReg a, b; + a.q[0] = test_values[0][0]; + a.q[1] = test_values[0][1]; + b.q[0] = test_values[1][0]; + b.q[1] = test_values[1][1]; + + asm("movdqa %2, %%xmm0\n" + "movdqa %3, %%xmm7\n" +#if defined(__x86_64__) + "movdqa %2, %%xmm15\n" +#endif + " fld1\n" + " fldpi\n" + " fldln2\n" + " fxsave %0\n" + " fxrstor %0\n" + " fxsave %1\n" + " fninit\n" + : "=m" (*(uint32_t *)fp2), "=m" (*(uint32_t *)fp) + : "m" (a), "m" (b)); + printf("fpuc=%04x\n", fp->fpuc); + printf("fpus=%04x\n", fp->fpus); + printf("fptag=%04x\n", fp->fptag); + for(i = 0; i < 3; i++) { + printf("ST%d: " FMT64X " %04x\n", + i, + *(uint64_t *)&fp->fpregs1[i * 16], + *(uint16_t *)&fp->fpregs1[i * 16 + 8]); + } + printf("mxcsr=%08x\n", fp->mxcsr & 0x1f80); +#if defined(__x86_64__) + nb_xmm = 16; +#else + nb_xmm = 8; +#endif + for(i = 0; i < nb_xmm; i++) { + printf("xmm%d: " FMT64X "" FMT64X "\n", + i, + *(uint64_t *)&fp->xmm_regs[i * 16], + *(uint64_t *)&fp->xmm_regs[i * 16 + 8]); + } +} + +void test_sse(void) +{ + XMMReg r, a, b; + int i; + + MMX_OP2(punpcklbw); + MMX_OP2(punpcklwd); + MMX_OP2(punpckldq); + MMX_OP2(packsswb); + MMX_OP2(pcmpgtb); + MMX_OP2(pcmpgtw); + MMX_OP2(pcmpgtd); + MMX_OP2(packuswb); + MMX_OP2(punpckhbw); + MMX_OP2(punpckhwd); + MMX_OP2(punpckhdq); + MMX_OP2(packssdw); + MMX_OP2(pcmpeqb); + MMX_OP2(pcmpeqw); + MMX_OP2(pcmpeqd); + + MMX_OP2(paddq); + MMX_OP2(pmullw); + MMX_OP2(psubusb); + MMX_OP2(psubusw); + MMX_OP2(pminub); + MMX_OP2(pand); + MMX_OP2(paddusb); + MMX_OP2(paddusw); + MMX_OP2(pmaxub); + MMX_OP2(pandn); + + MMX_OP2(pmulhuw); + MMX_OP2(pmulhw); + + MMX_OP2(psubsb); + MMX_OP2(psubsw); + MMX_OP2(pminsw); + MMX_OP2(por); + MMX_OP2(paddsb); + MMX_OP2(paddsw); + MMX_OP2(pmaxsw); + MMX_OP2(pxor); + MMX_OP2(pmuludq); + MMX_OP2(pmaddwd); + MMX_OP2(psadbw); + MMX_OP2(psubb); + MMX_OP2(psubw); + MMX_OP2(psubd); + MMX_OP2(psubq); + MMX_OP2(paddb); + MMX_OP2(paddw); + MMX_OP2(paddd); + + MMX_OP2(pavgb); + MMX_OP2(pavgw); + + asm volatile ("pinsrw $1, %1, %0" : "=y" (r.q[0]) : "r" (0x12345678)); + printf("%-9s: r=" FMT64X "\n", "pinsrw", r.q[0]); + + asm volatile ("pinsrw $5, %1, %0" : "=x" (r.dq) : "r" (0x12345678)); + printf("%-9s: r=" FMT64X "" FMT64X "\n", "pinsrw", r.q[1], r.q[0]); + + a.q[0] = test_values[0][0]; + a.q[1] = test_values[0][1]; + asm volatile ("pextrw $1, %1, %0" : "=r" (r.l[0]) : "y" (a.q[0])); + printf("%-9s: r=%08x\n", "pextrw", r.l[0]); + + asm volatile ("pextrw $5, %1, %0" : "=r" (r.l[0]) : "x" (a.dq)); + printf("%-9s: r=%08x\n", "pextrw", r.l[0]); + + asm volatile ("pmovmskb %1, %0" : "=r" (r.l[0]) : "y" (a.q[0])); + printf("%-9s: r=%08x\n", "pmovmskb", r.l[0]); + + asm volatile ("pmovmskb %1, %0" : "=r" (r.l[0]) : "x" (a.dq)); + printf("%-9s: r=%08x\n", "pmovmskb", r.l[0]); + + { + r.q[0] = -1; + r.q[1] = -1; + + a.q[0] = test_values[0][0]; + a.q[1] = test_values[0][1]; + b.q[0] = test_values[1][0]; + b.q[1] = test_values[1][1]; + asm volatile("maskmovq %1, %0" : + : "y" (a.q[0]), "y" (b.q[0]), "D" (&r) + : "memory"); + printf("%-9s: r=" FMT64X " a=" FMT64X " b=" FMT64X "\n", + "maskmov", + r.q[0], + a.q[0], + b.q[0]); + asm volatile("maskmovdqu %1, %0" : + : "x" (a.dq), "x" (b.dq), "D" (&r) + : "memory"); + printf("%-9s: r=" FMT64X "" FMT64X " a=" FMT64X "" FMT64X " b=" FMT64X "" FMT64X "\n", + "maskmov", + r.q[1], r.q[0], + a.q[1], a.q[0], + b.q[1], b.q[0]); + } + 
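+    /* Everything above ran on MMX registers; emms below resets the x87 tag
+       word so that the FPU/SSE code that follows starts from a clean state. */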
+ asm volatile ("emms"); + + SSE_OP2(punpcklqdq); + SSE_OP2(punpckhqdq); + SSE_OP2(andps); + SSE_OP2(andpd); + SSE_OP2(andnps); + SSE_OP2(andnpd); + SSE_OP2(orps); + SSE_OP2(orpd); + SSE_OP2(xorps); + SSE_OP2(xorpd); + + SSE_OP2(unpcklps); + SSE_OP2(unpcklpd); + SSE_OP2(unpckhps); + SSE_OP2(unpckhpd); + + SHUF_OP(shufps, 0x78); + SHUF_OP(shufpd, 0x02); + + PSHUF_OP(pshufd, 0x78); + PSHUF_OP(pshuflw, 0x78); + PSHUF_OP(pshufhw, 0x78); + + SHIFT_OP(psrlw, 7); + SHIFT_OP(psrlw, 16); + SHIFT_OP(psraw, 7); + SHIFT_OP(psraw, 16); + SHIFT_OP(psllw, 7); + SHIFT_OP(psllw, 16); + + SHIFT_OP(psrld, 7); + SHIFT_OP(psrld, 32); + SHIFT_OP(psrad, 7); + SHIFT_OP(psrad, 32); + SHIFT_OP(pslld, 7); + SHIFT_OP(pslld, 32); + + SHIFT_OP(psrlq, 7); + SHIFT_OP(psrlq, 32); + SHIFT_OP(psllq, 7); + SHIFT_OP(psllq, 32); + + SHIFT_IM(psrldq, 16); + SHIFT_IM(psrldq, 7); + SHIFT_IM(pslldq, 16); + SHIFT_IM(pslldq, 7); + + MOVMSK(movmskps); + MOVMSK(movmskpd); + + /* FPU specific ops */ + + { + uint32_t mxcsr; + asm volatile("stmxcsr %0" : "=m" (mxcsr)); + printf("mxcsr=%08x\n", mxcsr & 0x1f80); + asm volatile("ldmxcsr %0" : : "m" (mxcsr)); + } + + test_sse_comi(2, -1); + test_sse_comi(2, 2); + test_sse_comi(2, 3); + test_sse_comi(2, q_nan.d); + test_sse_comi(q_nan.d, -1); + + for(i = 0; i < 2; i++) { + a.s[0] = 2.7; + a.s[1] = 3.4; + a.s[2] = 4; + a.s[3] = -6.3; + b.s[0] = 45.7; + b.s[1] = 353.4; + b.s[2] = 4; + b.s[3] = 56.3; + if (i == 1) { + a.s[0] = q_nan.d; + b.s[3] = q_nan.d; + } + + SSE_OPS(add); + SSE_OPS(mul); + SSE_OPS(sub); + SSE_OPS(min); + SSE_OPS(div); + SSE_OPS(max); + SSE_OPS(sqrt); + SSE_OPS(cmpeq); + SSE_OPS(cmplt); + SSE_OPS(cmple); + SSE_OPS(cmpunord); + SSE_OPS(cmpneq); + SSE_OPS(cmpnlt); + SSE_OPS(cmpnle); + SSE_OPS(cmpord); + + + a.d[0] = 2.7; + a.d[1] = -3.4; + b.d[0] = 45.7; + b.d[1] = -53.4; + if (i == 1) { + a.d[0] = q_nan.d; + b.d[1] = q_nan.d; + } + SSE_OPD(add); + SSE_OPD(mul); + SSE_OPD(sub); + SSE_OPD(min); + SSE_OPD(div); + SSE_OPD(max); + SSE_OPD(sqrt); + SSE_OPD(cmpeq); + SSE_OPD(cmplt); + SSE_OPD(cmple); + SSE_OPD(cmpunord); + SSE_OPD(cmpneq); + SSE_OPD(cmpnlt); + SSE_OPD(cmpnle); + SSE_OPD(cmpord); + } + + /* float to float/int */ + a.s[0] = 2.7; + a.s[1] = 3.4; + a.s[2] = 4; + a.s[3] = -6.3; + CVT_OP_XMM(cvtps2pd); + CVT_OP_XMM(cvtss2sd); + CVT_OP_XMM2MMX(cvtps2pi); + CVT_OP_XMM2MMX(cvttps2pi); + CVT_OP_XMM2REG(cvtss2si); + CVT_OP_XMM2REG(cvttss2si); + CVT_OP_XMM(cvtps2dq); + CVT_OP_XMM(cvttps2dq); + + a.d[0] = 2.6; + a.d[1] = -3.4; + CVT_OP_XMM(cvtpd2ps); + CVT_OP_XMM(cvtsd2ss); + CVT_OP_XMM2MMX(cvtpd2pi); + CVT_OP_XMM2MMX(cvttpd2pi); + CVT_OP_XMM2REG(cvtsd2si); + CVT_OP_XMM2REG(cvttsd2si); + CVT_OP_XMM(cvtpd2dq); + CVT_OP_XMM(cvttpd2dq); + + /* sse/mmx moves */ + CVT_OP_XMM2MMX(movdq2q); + CVT_OP_MMX2XMM(movq2dq); + + /* int to float */ + a.l[0] = -6; + a.l[1] = 2; + a.l[2] = 100; + a.l[3] = -60000; + CVT_OP_MMX2XMM(cvtpi2ps); + CVT_OP_MMX2XMM(cvtpi2pd); + CVT_OP_REG2XMM(cvtsi2ss); + CVT_OP_REG2XMM(cvtsi2sd); + CVT_OP_XMM(cvtdq2ps); + CVT_OP_XMM(cvtdq2pd); + + /* XXX: test PNI insns */ +#if 0 + SSE_OP2(movshdup); +#endif + asm volatile ("emms"); +} + +#endif + +#define TEST_CONV_RAX(op)\ +{\ + unsigned long a, r;\ + a = i2l(0x8234a6f8);\ + r = a;\ + asm volatile(#op : "=a" (r) : "0" (r));\ + printf("%-10s A=" FMTLX " R=" FMTLX "\n", #op, a, r);\ +} + +#define TEST_CONV_RAX_RDX(op)\ +{\ + unsigned long a, d, r, rh; \ + a = i2l(0x8234a6f8);\ + d = i2l(0x8345a1f2);\ + r = a;\ + rh = d;\ + asm volatile(#op : "=a" (r), "=d" (rh) : "0" (r), "1" (rh)); \ + printf("%-10s A=" FMTLX " R=" 
FMTLX ":" FMTLX "\n", #op, a, r, rh); \ +} + +void test_conv(void) +{ + TEST_CONV_RAX(cbw); + TEST_CONV_RAX(cwde); +#if defined(__x86_64__) + TEST_CONV_RAX(cdqe); +#endif + + TEST_CONV_RAX_RDX(cwd); + TEST_CONV_RAX_RDX(cdq); +#if defined(__x86_64__) + TEST_CONV_RAX_RDX(cqo); +#endif + + { + unsigned long a, r; + a = i2l(0x12345678); + asm volatile("bswapl %k0" : "=r" (r) : "0" (a)); + printf("%-10s: A=" FMTLX " R=" FMTLX "\n", "bswapl", a, r); + } +#if defined(__x86_64__) + { + unsigned long a, r; + a = i2l(0x12345678); + asm volatile("bswapq %0" : "=r" (r) : "0" (a)); + printf("%-10s: A=" FMTLX " R=" FMTLX "\n", "bswapq", a, r); + } +#endif +} + +extern void *__start_initcall; +extern void *__stop_initcall; + + +int main(int argc, char **argv) +{ + void **ptr; + void (*func)(void); + + ptr = &__start_initcall; + while (ptr != &__stop_initcall) { + func = *ptr++; + func(); + } + test_bsx(); + test_mul(); + test_jcc(); + test_loop(); + test_floats(); +#if !defined(__x86_64__) + test_bcd(); +#endif + test_xchg(); + test_string(); + test_misc(); + test_lea(); +#ifdef TEST_SEGS + test_segs(); + test_code16(); +#endif +#ifdef TEST_VM86 + test_vm86(); +#endif +#if !defined(__x86_64__) + test_exceptions(); + test_self_modifying_code(); + test_single_step(); +#endif + test_enter(); + test_conv(); +#ifdef TEST_SSE + test_sse(); + test_fxsave(); +#endif + return 0; +} diff --git a/src/recompiler/tests/test-i386.h b/src/recompiler/tests/test-i386.h new file mode 100644 index 00000000..75106b8c --- /dev/null +++ b/src/recompiler/tests/test-i386.h @@ -0,0 +1,152 @@ + +#define exec_op glue(exec_, OP) +#define exec_opq glue(glue(exec_, OP), q) +#define exec_opl glue(glue(exec_, OP), l) +#define exec_opw glue(glue(exec_, OP), w) +#define exec_opb glue(glue(exec_, OP), b) + +#define EXECOP2(size, rsize, res, s1, flags) \ + asm ("push %4\n\t"\ + "popf\n\t"\ + stringify(OP) size " %" rsize "2, %" rsize "0\n\t" \ + "pushf\n\t"\ + "pop %1\n\t"\ + : "=q" (res), "=g" (flags)\ + : "q" (s1), "0" (res), "1" (flags)); \ + printf("%-10s A=" FMTLX " B=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", \ + stringify(OP) size, s0, s1, res, iflags, flags & CC_MASK); + +#define EXECOP1(size, rsize, res, flags) \ + asm ("push %3\n\t"\ + "popf\n\t"\ + stringify(OP) size " %" rsize "0\n\t" \ + "pushf\n\t"\ + "pop %1\n\t"\ + : "=q" (res), "=g" (flags)\ + : "0" (res), "1" (flags)); \ + printf("%-10s A=" FMTLX " R=" FMTLX " CCIN=%04lx CC=%04lx\n", \ + stringify(OP) size, s0, res, iflags, flags & CC_MASK); + +#ifdef OP1 +#if defined(__x86_64__) +void exec_opq(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP1("q", "", res, flags); +} +#endif + +void exec_opl(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP1("l", "k", res, flags); +} + +void exec_opw(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP1("w", "w", res, flags); +} + +void exec_opb(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP1("b", "b", res, flags); +} +#else +#if defined(__x86_64__) +void exec_opq(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP2("q", "", res, s1, flags); +} +#endif + +void exec_opl(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP2("l", "k", res, s1, flags); +} + +void exec_opw(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP2("w", "w", res, s1, flags); +} + 
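+/* This header is meant to be instantiated once per ALU operation by the
+   including .c file, which defines OP (and optionally OP1 or OP_CC) before
+   each include, roughly like:
+
+       #define OP add
+       #include "test-i386.h"
+
+   Each inclusion then expands into the size-specific exec_ helpers and a
+   test_ function that registers itself in the __init_call section walked by
+   main(). */
+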
+void exec_opb(long s0, long s1, long iflags) +{ + long res, flags; + res = s0; + flags = iflags; + EXECOP2("b", "b", res, s1, flags); +} +#endif + +void exec_op(long s0, long s1) +{ + s0 = i2l(s0); + s1 = i2l(s1); +#if defined(__x86_64__) + exec_opq(s0, s1, 0); +#endif + exec_opl(s0, s1, 0); + exec_opw(s0, s1, 0); + exec_opb(s0, s1, 0); +#ifdef OP_CC +#if defined(__x86_64__) + exec_opq(s0, s1, CC_C); +#endif + exec_opl(s0, s1, CC_C); + exec_opw(s0, s1, CC_C); + exec_opb(s0, s1, CC_C); +#endif +} + +void glue(test_, OP)(void) +{ + exec_op(0x12345678, 0x812FADA); + exec_op(0x12341, 0x12341); + exec_op(0x12341, -0x12341); + exec_op(0xffffffff, 0); + exec_op(0xffffffff, -1); + exec_op(0xffffffff, 1); + exec_op(0xffffffff, 2); + exec_op(0x7fffffff, 0); + exec_op(0x7fffffff, 1); + exec_op(0x7fffffff, -1); + exec_op(0x80000000, -1); + exec_op(0x80000000, 1); + exec_op(0x80000000, -2); + exec_op(0x12347fff, 0); + exec_op(0x12347fff, 1); + exec_op(0x12347fff, -1); + exec_op(0x12348000, -1); + exec_op(0x12348000, 1); + exec_op(0x12348000, -2); + exec_op(0x12347f7f, 0); + exec_op(0x12347f7f, 1); + exec_op(0x12347f7f, -1); + exec_op(0x12348080, -1); + exec_op(0x12348080, 1); + exec_op(0x12348080, -2); +} + +void *glue(_test_, OP) __init_call = glue(test_, OP); + +#undef OP +#undef OP_CC diff --git a/src/recompiler/tests/test-mmap.c b/src/recompiler/tests/test-mmap.c new file mode 100644 index 00000000..1d444b63 --- /dev/null +++ b/src/recompiler/tests/test-mmap.c @@ -0,0 +1,485 @@ +/* + * Small test program to verify simulated mmap behaviour. + * + * When running qemu-linux-user with the -p flag, you may need to tell + * this test program about the pagesize because getpagesize() will not reflect + * the -p choice. Simply pass one argument beeing the pagesize. + * + * Copyright (c) 2007 AXIS Communications AB + * Written by Edgar E. Iglesias. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle GPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the General Public License version 2 (GPLv2) at this time for any software where + * a choice of GPL license versions is made available with the language indicating + * that GPLv2 or any later version may be used, or where a choice of which version + * of the GPL is applied is otherwise unspecified. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> + +#include <sys/mman.h> + +#define D(x) + +#define fail_unless(x) \ +do \ +{ \ + if (!(x)) { \ + fprintf (stderr, "FAILED at %s:%d\n", __FILE__, __LINE__); \ + exit (EXIT_FAILURE); \ + } \ +} while (0); + +unsigned char *dummybuf; +static unsigned int pagesize; +static unsigned int pagemask; +int test_fd; +size_t test_fsize; + +void check_aligned_anonymous_unfixed_mmaps(void) +{ + void *p1; + void *p2; + void *p3; + void *p4; + void *p5; + uintptr_t p; + int i; + + fprintf (stderr, "%s", __func__); + for (i = 0; i < 0x1fff; i++) + { + size_t len; + + len = pagesize + (pagesize * i & 7); + p1 = mmap(NULL, len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + p2 = mmap(NULL, len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + p3 = mmap(NULL, len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + p4 = mmap(NULL, len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + p5 = mmap(NULL, len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + /* Make sure we get pages aligned with the pagesize. The + target expects this. */ + fail_unless (p1 != MAP_FAILED); + fail_unless (p2 != MAP_FAILED); + fail_unless (p3 != MAP_FAILED); + fail_unless (p4 != MAP_FAILED); + fail_unless (p5 != MAP_FAILED); + p = (uintptr_t) p1; + D(printf ("p=%x\n", p)); + fail_unless ((p & pagemask) == 0); + p = (uintptr_t) p2; + fail_unless ((p & pagemask) == 0); + p = (uintptr_t) p3; + fail_unless ((p & pagemask) == 0); + p = (uintptr_t) p4; + fail_unless ((p & pagemask) == 0); + p = (uintptr_t) p5; + fail_unless ((p & pagemask) == 0); + + /* Make sure we can read from the entire area. */ + memcpy (dummybuf, p1, pagesize); + memcpy (dummybuf, p2, pagesize); + memcpy (dummybuf, p3, pagesize); + memcpy (dummybuf, p4, pagesize); + memcpy (dummybuf, p5, pagesize); + + munmap (p1, len); + munmap (p2, len); + munmap (p3, len); + munmap (p4, len); + munmap (p5, len); + } + fprintf (stderr, " passed\n"); +} + +void check_large_anonymous_unfixed_mmap(void) +{ + void *p1; + uintptr_t p; + size_t len; + + fprintf (stderr, "%s", __func__); + + len = 0x02000000; + p1 = mmap(NULL, len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + /* Make sure we get pages aligned with the pagesize. The + target expects this. */ + fail_unless (p1 != MAP_FAILED); + p = (uintptr_t) p1; + fail_unless ((p & pagemask) == 0); + + /* Make sure we can read from the entire area. */ + memcpy (dummybuf, p1, pagesize); + munmap (p1, len); + fprintf (stderr, " passed\n"); +} + +void check_aligned_anonymous_unfixed_colliding_mmaps(void) +{ + char *p1; + char *p2; + char *p3; + uintptr_t p; + int i; + + fprintf (stderr, "%s", __func__); + for (i = 0; i < 0x2fff; i++) + { + int nlen; + p1 = mmap(NULL, pagesize, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + fail_unless (p1 != MAP_FAILED); + p = (uintptr_t) p1; + fail_unless ((p & pagemask) == 0); + memcpy (dummybuf, p1, pagesize); + + p2 = mmap(NULL, pagesize, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + fail_unless (p2 != MAP_FAILED); + p = (uintptr_t) p2; + fail_unless ((p & pagemask) == 0); + memcpy (dummybuf, p2, pagesize); + + + munmap (p1, pagesize); + nlen = pagesize * 8; + p3 = mmap(NULL, nlen, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + /* Check if the mmaped areas collide. */ + if (p3 < p2 + && (p3 + nlen) > p2) + fail_unless (0); + + memcpy (dummybuf, p3, pagesize); + + /* Make sure we get pages aligned with the pagesize. The + target expects this. 
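+       (qemu-user may emulate a target page size larger than the host's, for
+       instance via the -p option mentioned in the header comment, so the
+       addresses returned by mmap must be aligned to the target page size,
+       not just the host's.)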
*/
+ fail_unless (p3 != MAP_FAILED);
+ p = (uintptr_t) p3;
+ fail_unless ((p & pagemask) == 0);
+ munmap (p2, pagesize);
+ munmap (p3, nlen);
+ }
+ fprintf (stderr, " passed\n");
+}
+
+void check_aligned_anonymous_fixed_mmaps(void)
+{
+ char *addr;
+ void *p1;
+ uintptr_t p;
+ int i;
+
+ /* Find a suitable address to start with. */
+ addr = mmap(NULL, pagesize * 40, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
+ fprintf (stderr, "%s addr=%p", __func__, addr);
+ fail_unless (addr != MAP_FAILED);
+
+ for (i = 0; i < 40; i++)
+ {
+ /* Create submaps within our unfixed map. */
+ p1 = mmap(addr, pagesize, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ -1, 0);
+ /* Make sure we get pages aligned with the pagesize.
+ The target expects this. */
+ p = (uintptr_t) p1;
+ fail_unless (p1 == addr);
+ fail_unless ((p & pagemask) == 0);
+ memcpy (dummybuf, p1, pagesize);
+ munmap (p1, pagesize);
+ addr += pagesize;
+ }
+ fprintf (stderr, " passed\n");
+}
+
+void check_aligned_anonymous_fixed_mmaps_collide_with_host(void)
+{
+ char *addr;
+ void *p1;
+ uintptr_t p;
+ int i;
+
+ /* Find a suitable address to start with. Right where the x86 host's
+ stack is. */
+ addr = ((void *)0x80000000);
+ fprintf (stderr, "%s addr=%p", __func__, addr);
+ fprintf (stderr, "FIXME: QEMU fails to track pages used by the host.");
+
+ for (i = 0; i < 20; i++)
+ {
+ /* Create submaps within our unfixed map. */
+ p1 = mmap(addr, pagesize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ -1, 0);
+ /* Make sure we get pages aligned with the pagesize.
+ The target expects this. */
+ p = (uintptr_t) p1;
+ fail_unless (p1 == addr);
+ fail_unless ((p & pagemask) == 0);
+ memcpy (p1, dummybuf, pagesize);
+ munmap (p1, pagesize);
+ addr += pagesize;
+ }
+ fprintf (stderr, " passed\n");
+}
+
+void check_file_unfixed_mmaps(void)
+{
+ unsigned int *p1, *p2, *p3;
+ uintptr_t p;
+ int i;
+
+ fprintf (stderr, "%s", __func__);
+ for (i = 0; i < 0x10; i++)
+ {
+ size_t len;
+
+ len = pagesize;
+ p1 = mmap(NULL, len, PROT_READ,
+ MAP_PRIVATE,
+ test_fd, 0);
+ p2 = mmap(NULL, len, PROT_READ,
+ MAP_PRIVATE,
+ test_fd, pagesize);
+ p3 = mmap(NULL, len, PROT_READ,
+ MAP_PRIVATE,
+ test_fd, pagesize * 2);
+
+ fail_unless (p1 != MAP_FAILED);
+ fail_unless (p2 != MAP_FAILED);
+ fail_unless (p3 != MAP_FAILED);
+
+ /* Make sure we get pages aligned with the pagesize. The
+ target expects this. */
+ p = (uintptr_t) p1;
+ fail_unless ((p & pagemask) == 0);
+ p = (uintptr_t) p2;
+ fail_unless ((p & pagemask) == 0);
+ p = (uintptr_t) p3;
+ fail_unless ((p & pagemask) == 0);
+
+ /* Verify that the file maps were made correctly. */
+ D(printf ("p1=%d p2=%d p3=%d\n", *p1, *p2, *p3));
+ fail_unless (*p1 == 0);
+ fail_unless (*p2 == (pagesize / sizeof *p2));
+ fail_unless (*p3 == ((pagesize * 2) / sizeof *p3));
+
+ memcpy (dummybuf, p1, pagesize);
+ memcpy (dummybuf, p2, pagesize);
+ memcpy (dummybuf, p3, pagesize);
+ munmap (p1, len);
+ munmap (p2, len);
+ munmap (p3, len);
+ }
+ fprintf (stderr, " passed\n");
+}
+
+void check_file_unfixed_eof_mmaps(void)
+{
+ char *cp;
+ unsigned int *p1;
+ uintptr_t p;
+ int i;
+
+ fprintf (stderr, "%s", __func__);
+ for (i = 0; i < 0x10; i++)
+ {
+ p1 = mmap(NULL, pagesize, PROT_READ,
+ MAP_PRIVATE,
+ test_fd,
+ (test_fsize - sizeof *p1) & ~pagemask);
+
+ fail_unless (p1 != MAP_FAILED);
+
+ /* Make sure we get pages aligned with the pagesize. The
+ target expects this.
*/
+ p = (uintptr_t) p1;
+ fail_unless ((p & pagemask) == 0);
+ /* Verify that the file maps were made correctly. */
+ fail_unless (p1[(test_fsize & pagemask) / sizeof *p1 - 1]
+ == ((test_fsize - sizeof *p1) / sizeof *p1));
+
+ /* Verify that the end of the page is accessible and zeroed. */
+ cp = (void *) p1;
+ fail_unless (cp[pagesize - 4] == 0);
+ munmap (p1, pagesize);
+ }
+ fprintf (stderr, " passed\n");
+}
+
+void check_file_fixed_eof_mmaps(void)
+{
+ char *addr;
+ char *cp;
+ unsigned int *p1;
+ uintptr_t p;
+ int i;
+
+ /* Find a suitable address to start with. */
+ addr = mmap(NULL, pagesize * 44, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
+
+ fprintf (stderr, "%s addr=%p", __func__, (void *)addr);
+ fail_unless (addr != MAP_FAILED);
+
+ for (i = 0; i < 0x10; i++)
+ {
+ /* Create submaps within our unfixed map. */
+ p1 = mmap(addr, pagesize, PROT_READ,
+ MAP_PRIVATE | MAP_FIXED,
+ test_fd,
+ (test_fsize - sizeof *p1) & ~pagemask);
+
+ fail_unless (p1 != MAP_FAILED);
+
+ /* Make sure we get pages aligned with the pagesize. The
+ target expects this. */
+ p = (uintptr_t) p1;
+ fail_unless ((p & pagemask) == 0);
+
+ /* Verify that the file maps were made correctly. */
+ fail_unless (p1[(test_fsize & pagemask) / sizeof *p1 - 1]
+ == ((test_fsize - sizeof *p1) / sizeof *p1));
+
+ /* Verify that the end of the page is accessible and zeroed. */
+ cp = (void *)p1;
+ fail_unless (cp[pagesize - 4] == 0);
+ munmap (p1, pagesize);
+ addr += pagesize;
+ }
+ fprintf (stderr, " passed\n");
+}
+
+void check_file_fixed_mmaps(void)
+{
+ unsigned char *addr;
+ unsigned int *p1, *p2, *p3, *p4;
+ int i;
+
+ /* Find a suitable address to start with. */
+ addr = mmap(NULL, pagesize * 40 * 4, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
+ fprintf (stderr, "%s addr=%p", __func__, (void *)addr);
+ fail_unless (addr != MAP_FAILED);
+
+ for (i = 0; i < 40; i++)
+ {
+ p1 = mmap(addr, pagesize, PROT_READ,
+ MAP_PRIVATE | MAP_FIXED,
+ test_fd, 0);
+ p2 = mmap(addr + pagesize, pagesize, PROT_READ,
+ MAP_PRIVATE | MAP_FIXED,
+ test_fd, pagesize);
+ p3 = mmap(addr + pagesize * 2, pagesize, PROT_READ,
+ MAP_PRIVATE | MAP_FIXED,
+ test_fd, pagesize * 2);
+ p4 = mmap(addr + pagesize * 3, pagesize, PROT_READ,
+ MAP_PRIVATE | MAP_FIXED,
+ test_fd, pagesize * 3);
+
+ /* Make sure we get pages aligned with the pagesize.
+ The target expects this. */
+ fail_unless (p1 == (void *)addr);
+ fail_unless (p2 == (void *)(addr + pagesize));
+ fail_unless (p3 == (void *)(addr + pagesize * 2));
+ fail_unless (p4 == (void *)(addr + pagesize * 3));
+
+ /* Verify that the file maps were made correctly. */
+ fail_unless (*p1 == 0);
+ fail_unless (*p2 == (pagesize / sizeof *p2));
+ fail_unless (*p3 == ((pagesize * 2) / sizeof *p3));
+ fail_unless (*p4 == ((pagesize * 3) / sizeof *p4));
+
+ memcpy (dummybuf, p1, pagesize);
+ memcpy (dummybuf, p2, pagesize);
+ memcpy (dummybuf, p3, pagesize);
+ memcpy (dummybuf, p4, pagesize);
+
+ munmap (p1, pagesize);
+ munmap (p2, pagesize);
+ munmap (p3, pagesize);
+ munmap (p4, pagesize);
+ addr += pagesize * 4;
+ }
+ fprintf (stderr, " passed\n");
+}
+
+int main(int argc, char **argv)
+{
+ char tempname[] = "/tmp/.cmmapXXXXXX";
+ unsigned int i;
+
+ /* Trust the first argument, otherwise probe the system for our
+ pagesize. */
+ if (argc > 1)
+ pagesize = strtoul(argv[1], NULL, 0);
+ else
+ pagesize = sysconf(_SC_PAGESIZE);
+
+ /* Assume pagesize is a power of two.
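+       pagesize - 1 then yields a mask of the in-page offset bits (e.g. 4096
+       gives 0xfff), so (p & pagemask) == 0 tests page alignment.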
*/ + pagemask = pagesize - 1; + dummybuf = malloc (pagesize); + printf ("pagesize=%u pagemask=%x\n", pagesize, pagemask); + + test_fd = mkstemp(tempname); + unlink(tempname); + + /* Fill the file with int's counting from zero and up. */ + for (i = 0; i < (pagesize * 4) / sizeof i; i++) + write (test_fd, &i, sizeof i); + /* Append a few extra writes to make the file end at non + page boundary. */ + write (test_fd, &i, sizeof i); i++; + write (test_fd, &i, sizeof i); i++; + write (test_fd, &i, sizeof i); i++; + + test_fsize = lseek(test_fd, 0, SEEK_CUR); + + /* Run the tests. */ + check_aligned_anonymous_unfixed_mmaps(); + check_aligned_anonymous_unfixed_colliding_mmaps(); + check_aligned_anonymous_fixed_mmaps(); + check_file_unfixed_mmaps(); + check_file_fixed_mmaps(); + check_file_fixed_eof_mmaps(); + check_file_unfixed_eof_mmaps(); + + /* Fails at the moment. */ + /* check_aligned_anonymous_fixed_mmaps_collide_with_host(); */ + + return EXIT_SUCCESS; +} diff --git a/src/recompiler/tests/test_path.c b/src/recompiler/tests/test_path.c new file mode 100644 index 00000000..def7441c --- /dev/null +++ b/src/recompiler/tests/test_path.c @@ -0,0 +1,151 @@ +/* Test path override code */ +#define _GNU_SOURCE +#include "../path.c" +#include <stdarg.h> +#include <sys/stat.h> +#include <fcntl.h> + +/* Any log message kills the test. */ +void gemu_log(const char *fmt, ...) +{ + va_list ap; + + fprintf(stderr, "FATAL: "); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + +#define NO_CHANGE(_path) \ + do { \ + if (strcmp(path(_path), _path) != 0) return __LINE__; \ + } while(0) + +#define CHANGE_TO(_path, _newpath) \ + do { \ + if (strcmp(path(_path), _newpath) != 0) return __LINE__; \ + } while(0) + +static void cleanup(void) +{ + unlink("/tmp/qemu-test_path/DIR1/DIR2/FILE"); + unlink("/tmp/qemu-test_path/DIR1/DIR2/FILE2"); + unlink("/tmp/qemu-test_path/DIR1/DIR2/FILE3"); + unlink("/tmp/qemu-test_path/DIR1/DIR2/FILE4"); + unlink("/tmp/qemu-test_path/DIR1/DIR2/FILE5"); + rmdir("/tmp/qemu-test_path/DIR1/DIR2"); + rmdir("/tmp/qemu-test_path/DIR1/DIR3"); + rmdir("/tmp/qemu-test_path/DIR1"); + rmdir("/tmp/qemu-test_path"); +} + +static unsigned int do_test(void) +{ + if (mkdir("/tmp/qemu-test_path", 0700) != 0) + return __LINE__; + + if (mkdir("/tmp/qemu-test_path/DIR1", 0700) != 0) + return __LINE__; + + if (mkdir("/tmp/qemu-test_path/DIR1/DIR2", 0700) != 0) + return __LINE__; + + if (mkdir("/tmp/qemu-test_path/DIR1/DIR3", 0700) != 0) + return __LINE__; + + if (close(creat("/tmp/qemu-test_path/DIR1/DIR2/FILE", 0600)) != 0) + return __LINE__; + + if (close(creat("/tmp/qemu-test_path/DIR1/DIR2/FILE2", 0600)) != 0) + return __LINE__; + + if (close(creat("/tmp/qemu-test_path/DIR1/DIR2/FILE3", 0600)) != 0) + return __LINE__; + + if (close(creat("/tmp/qemu-test_path/DIR1/DIR2/FILE4", 0600)) != 0) + return __LINE__; + + if (close(creat("/tmp/qemu-test_path/DIR1/DIR2/FILE5", 0600)) != 0) + return __LINE__; + + init_paths("/tmp/qemu-test_path"); + + NO_CHANGE("/tmp"); + NO_CHANGE("/tmp/"); + NO_CHANGE("/tmp/qemu-test_path"); + NO_CHANGE("/tmp/qemu-test_path/"); + NO_CHANGE("/tmp/qemu-test_path/D"); + NO_CHANGE("/tmp/qemu-test_path/DI"); + NO_CHANGE("/tmp/qemu-test_path/DIR"); + NO_CHANGE("/tmp/qemu-test_path/DIR1"); + NO_CHANGE("/tmp/qemu-test_path/DIR1/"); + + NO_CHANGE("/D"); + NO_CHANGE("/DI"); + NO_CHANGE("/DIR"); + NO_CHANGE("/DIR2"); + NO_CHANGE("/DIR1."); + + CHANGE_TO("/DIR1", "/tmp/qemu-test_path/DIR1"); + CHANGE_TO("/DIR1/", "/tmp/qemu-test_path/DIR1"); + + 
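+    /* path() only rewrites a name when the corresponding entry actually
+       exists under the prefix passed to init_paths(); mere prefixes of a
+       valid name and entries not present in the tree created above stay
+       unchanged. */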
NO_CHANGE("/DIR1/D"); + NO_CHANGE("/DIR1/DI"); + NO_CHANGE("/DIR1/DIR"); + NO_CHANGE("/DIR1/DIR1"); + + CHANGE_TO("/DIR1/DIR2", "/tmp/qemu-test_path/DIR1/DIR2"); + CHANGE_TO("/DIR1/DIR2/", "/tmp/qemu-test_path/DIR1/DIR2"); + + CHANGE_TO("/DIR1/DIR3", "/tmp/qemu-test_path/DIR1/DIR3"); + CHANGE_TO("/DIR1/DIR3/", "/tmp/qemu-test_path/DIR1/DIR3"); + + NO_CHANGE("/DIR1/DIR2/F"); + NO_CHANGE("/DIR1/DIR2/FI"); + NO_CHANGE("/DIR1/DIR2/FIL"); + NO_CHANGE("/DIR1/DIR2/FIL."); + + CHANGE_TO("/DIR1/DIR2/FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/DIR1/DIR2/FILE2", "/tmp/qemu-test_path/DIR1/DIR2/FILE2"); + CHANGE_TO("/DIR1/DIR2/FILE3", "/tmp/qemu-test_path/DIR1/DIR2/FILE3"); + CHANGE_TO("/DIR1/DIR2/FILE4", "/tmp/qemu-test_path/DIR1/DIR2/FILE4"); + CHANGE_TO("/DIR1/DIR2/FILE5", "/tmp/qemu-test_path/DIR1/DIR2/FILE5"); + + NO_CHANGE("/DIR1/DIR2/FILE6"); + NO_CHANGE("/DIR1/DIR2/FILE/X"); + + CHANGE_TO("/DIR1/../DIR1", "/tmp/qemu-test_path/DIR1"); + CHANGE_TO("/DIR1/../DIR1/", "/tmp/qemu-test_path/DIR1"); + CHANGE_TO("/../DIR1", "/tmp/qemu-test_path/DIR1"); + CHANGE_TO("/../DIR1/", "/tmp/qemu-test_path/DIR1"); + CHANGE_TO("/DIR1/DIR2/../DIR2", "/tmp/qemu-test_path/DIR1/DIR2"); + CHANGE_TO("/DIR1/DIR2/../DIR2/../../DIR1/DIR2/FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/DIR1/DIR2/../DIR2/FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + + NO_CHANGE("/DIR1/DIR2/../DIR1"); + NO_CHANGE("/DIR1/DIR2/../FILE"); + + CHANGE_TO("/./DIR1/DIR2/FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/././DIR1/DIR2/FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/DIR1/./DIR2/FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/DIR1/././DIR2/FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/DIR1/DIR2/./FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/DIR1/DIR2/././FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + CHANGE_TO("/./DIR1/./DIR2/./FILE", "/tmp/qemu-test_path/DIR1/DIR2/FILE"); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret; + + ret = do_test(); + cleanup(); + if (ret) { + fprintf(stderr, "test_path: failed on line %i\n", ret); + return 1; + } + return 0; +} diff --git a/src/recompiler/tests/testthread.c b/src/recompiler/tests/testthread.c new file mode 100644 index 00000000..27e4825b --- /dev/null +++ b/src/recompiler/tests/testthread.c @@ -0,0 +1,51 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <signal.h> +#include <unistd.h> +#include <inttypes.h> +#include <pthread.h> +#include <sys/wait.h> +#include <sched.h> + +void *thread1_func(void *arg) +{ + int i; + char buf[512]; + + for(i=0;i<10;i++) { + snprintf(buf, sizeof(buf), "thread1: %d %s\n", i, (char *)arg); + write(1, buf, strlen(buf)); + usleep(100 * 1000); + } + return NULL; +} + +void *thread2_func(void *arg) +{ + int i; + char buf[512]; + for(i=0;i<20;i++) { + snprintf(buf, sizeof(buf), "thread2: %d %s\n", i, (char *)arg); + write(1, buf, strlen(buf)); + usleep(150 * 1000); + } + return NULL; +} + +void test_pthread(void) +{ + pthread_t tid1, tid2; + + pthread_create(&tid1, NULL, thread1_func, "hello1"); + pthread_create(&tid2, NULL, thread2_func, "hello2"); + pthread_join(tid1, NULL); + pthread_join(tid2, NULL); + printf("End of pthread test.\n"); +} + +int main(int argc, char **argv) +{ + test_pthread(); + return 0; +} diff --git a/src/recompiler/translate-all.c b/src/recompiler/translate-all.c new file mode 100644 index 00000000..b6882559 --- /dev/null +++ b/src/recompiler/translate-all.c @@ -0,0 +1,186 @@ +/* + * Host code generation + * 
+ * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Oracle LGPL Disclaimer: For the avoidance of doubt, except that if any license choice + * other than GPL or LGPL is available it will apply instead, Oracle elects to use only + * the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where + * a choice of LGPL license versions is made available with the language indicating + * that LGPLv2 or any later version may be used, or where a choice of which version + * of the LGPL is applied is otherwise unspecified. + */ + +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> + +#include "config.h" + +#define NO_CPU_IO_DEFS +#include "cpu.h" +#include "exec-all.h" +#include "disas.h" +#include "tcg.h" +#include "qemu-timer.h" + +/* code generation context */ +TCGContext tcg_ctx; + +uint16_t gen_opc_buf[OPC_BUF_SIZE]; +TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE]; + +target_ulong gen_opc_pc[OPC_BUF_SIZE]; +uint16_t gen_opc_icount[OPC_BUF_SIZE]; +uint8_t gen_opc_instr_start[OPC_BUF_SIZE]; + +void cpu_gen_init(void) +{ + tcg_context_init(&tcg_ctx); + tcg_set_frame(&tcg_ctx, TCG_AREG0, offsetof(CPUState, temp_buf), + sizeof(((CPUState *)0)->temp_buf)); +} + +/* return non zero if the very first instruction is invalid so that + the virtual CPU can trigger an exception. + + '*gen_code_size_ptr' contains the size of the generated code (host + code). 
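+
+   The guest code described by 'tb' is first turned into TCG ops by
+   gen_intermediate_code() and then into host code at tb->tc_ptr by
+   tcg_gen_code(), as done in the body below.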
+*/ +int cpu_gen_code(CPUState *env, TranslationBlock *tb, int *gen_code_size_ptr) +{ + TCGContext *s = &tcg_ctx; + uint8_t *gen_code_buf; + int gen_code_size; +#ifdef CONFIG_PROFILER + int64_t ti; +#endif + +#ifdef CONFIG_PROFILER + s->tb_count1++; /* includes aborted translations because of + exceptions */ + ti = profile_getclock(); +#endif + +#ifdef VBOX + RAWEx_ProfileStart(env, STATS_QEMU_COMPILATION); +#endif + + tcg_func_start(s); + + gen_intermediate_code(env, tb); + + /* generate machine code */ + gen_code_buf = tb->tc_ptr; + tb->tb_next_offset[0] = 0xffff; + tb->tb_next_offset[1] = 0xffff; + s->tb_next_offset = tb->tb_next_offset; +#ifdef USE_DIRECT_JUMP + s->tb_jmp_offset = tb->tb_jmp_offset; + s->tb_next = NULL; +#else + s->tb_jmp_offset = NULL; + s->tb_next = tb->tb_next; +#endif + +#ifdef CONFIG_PROFILER + s->tb_count++; + s->interm_time += profile_getclock() - ti; + s->code_time -= profile_getclock(); +#endif + gen_code_size = tcg_gen_code(s, gen_code_buf); + *gen_code_size_ptr = gen_code_size; +#ifdef CONFIG_PROFILER + s->code_time += profile_getclock(); + s->code_in_len += tb->size; + s->code_out_len += gen_code_size; +#endif + +#ifdef VBOX + RAWEx_ProfileStop(env, STATS_QEMU_COMPILATION); +#endif + +#ifdef DEBUG_DISAS + if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) { + qemu_log("OUT: [size=%d]\n", *gen_code_size_ptr); + log_disas(tb->tc_ptr, *gen_code_size_ptr); + qemu_log("\n"); + qemu_log_flush(); + } +#endif + return 0; +} + +/* The cpu state corresponding to 'searched_pc' is restored. + */ +int cpu_restore_state(TranslationBlock *tb, + CPUState *env, uintptr_t searched_pc, + void *puc) +{ + TCGContext *s = &tcg_ctx; + int j; + uintptr_t tc_ptr; +#ifdef CONFIG_PROFILER + int64_t ti; +#endif + +#ifdef CONFIG_PROFILER + ti = profile_getclock(); +#endif + tcg_func_start(s); + + gen_intermediate_code_pc(env, tb); + + if (use_icount) { + /* Reset the cycle counter to the start of the block. */ + env->icount_decr.u16.low += tb->icount; + /* Clear the IO flag. */ + env->can_do_io = 0; + } + + /* find opc index corresponding to search_pc */ + tc_ptr = (uintptr_t)tb->tc_ptr; + if (searched_pc < tc_ptr) + return -1; + + s->tb_next_offset = tb->tb_next_offset; +#ifdef USE_DIRECT_JUMP + s->tb_jmp_offset = tb->tb_jmp_offset; + s->tb_next = NULL; +#else + s->tb_jmp_offset = NULL; + s->tb_next = tb->tb_next; +#endif + j = tcg_gen_code_search_pc(s, (uint8_t *)tc_ptr, searched_pc - tc_ptr); + if (j < 0) + return -1; + /* now find start of instruction before */ + while (gen_opc_instr_start[j] == 0) + j--; + env->icount_decr.u16.low -= gen_opc_icount[j]; + + gen_pc_load(env, tb, searched_pc, j, puc); + +#ifdef CONFIG_PROFILER + s->restore_time += profile_getclock() - ti; + s->restore_count++; +#endif + return 0; +} |