Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ramses] serious performance regression #2766

Closed
Xarthisius opened this issue Jul 20, 2020 · 4 comments
Closed

[ramses] serious performance regression #2766

Xarthisius opened this issue Jul 20, 2020 · 4 comments

Comments

@Xarthisius
Copy link
Member

Bug report

Current master suffers from a serious regression for Ramses frontend. Take the following code as an example:

import yt

field = "temperature"
output_00080 = "output_00080/info_00080.txt"
axis = 0
weight_field = None

ds = yt.load(output_00080)
proj = ds.proj(field, axis, weight_field=weight_field, data_source=None)
frb = proj.to_frb((1.0, "unitary"), 256)
d = frb.data
for f in proj.field_data:
    d["%s_sum" % (f,)] = proj.field_data[f].sum(dtype="float64")

On master (6daf9cd):

$ time python3.8 regression.py 
real	0m43.792s
user	0m43.704s
sys	0m0.969s

on an earlier commit (before the regression; the exact reference was lost in formatting):

$ time python3.8 regression.py 
real	0m3.882s
user	0m3.786s
sys	0m0.799s

Here's the basic profile:

Program: regression.py

43.799 <module>  regression.py:1
├─ 41.573 __init__  yt/data_objects/construction_data_containers.py:463
│  ├─ 40.201 get_data  yt/data_objects/construction_data_containers.py:223
│  │  └─ 40.145 _handle_chunk  yt/data_objects/construction_data_containers.py:548
│  │     └─ 39.919 __getitem__  yt/data_objects/data_containers.py:256
│  │        └─ 39.919 get_data  yt/data_objects/data_containers.py:1614
│  │           └─ 39.810 _read_fluid_fields  yt/geometry/geometry_handler.py:233
│  │              └─ 39.810 _read_fluid_selection  yt/frontends/ramses/io.py:88
│  │                 └─ 39.809 fill  yt/frontends/ramses/data_structures.py:336
│  │                    └─ 39.809 _fill_no_ghostzones  yt/frontends/ramses/data_structures.py:221
│  │                       └─ 39.783 [self]  
│  └─ 1.372 __init__  yt/data_objects/construction_data_containers.py:159
│     └─ 1.371 all_data  yt/data_objects/static_output.py:960
│        └─ 1.371 index  yt/data_objects/static_output.py:495
│           └─ 1.250 create_field_info  yt/frontends/ramses/data_structures.py:593
│              └─ 1.250 create_field_info  yt/data_objects/static_output.py:559
│                 └─ 0.670 load_all_plugins  yt/fields/field_info_container.py:337
│                    └─ 0.656 find_dependencies  yt/fields/field_info_container.py:354
│                       └─ 0.655 check_derived_fields  yt/fields/field_info_container.py:426
│                          └─ 0.653 get_dependencies  yt/fields/derived_field.py:239
│                             └─ 0.645 __missing__  yt/fields/field_detector.py:101
│                                └─ 0.638 __call__  yt/fields/derived_field.py:278
└─ 2.037 <module>  yt/__init__.py:1
   └─ 1.758 <module>  yt/funcs.py:1
      └─ 1.687 <module>  yt/units/__init__.py:1
         └─ 1.522 <module>  unyt/__init__.py:1
               [1917 frames hidden]  unyt, copy, sympy, inspect, enum, tok...

@triage-new-issues triage-new-issues bot added the triage Triage needed label Jul 20, 2020
@matthewturk
Copy link
Member

I looked over the code that may cause it, and I think the issue is in oct_container.pyx, specifically lines like these:

for i, lvl in enumerate(levels):

and

for i, (lev, dom) in enumerate(zip(levels, domains)):

I believe that inside Cython, these will always be expanded, and will result in many many python objects being created and interpreted. For instance, cython -a on the first one suggests it results in code like:

  __pyx_t_6 = 0;
    if (likely(PyList_CheckExact(((PyObject *)__pyx_v_levels))) || PyTuple_CheckExact(((PyObject *)__pyx_v_levels))) {
      __pyx_t_4 = ((PyObject *)__pyx_v_levels); __Pyx_INCREF(__pyx_t_4); __pyx_t_11 = 0;
      __pyx_t_12 = NULL;
    } else {
      __pyx_t_11 = -1; __pyx_t_4 = PyObject_GetIter(((PyObject *)__pyx_v_levels)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 735, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_4);
      __pyx_t_12 = Py_TYPE(__pyx_t_4)->tp_iternext; if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 735, __pyx_L1_error)
    }
    for (;;) {
      if (likely(!__pyx_t_12)) {
        if (likely(PyList_CheckExact(__pyx_t_4))) {
          if (__pyx_t_11 >= PyList_GET_SIZE(__pyx_t_4)) break;
          #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
          __pyx_t_13 = PyList_GET_ITEM(__pyx_t_4, __pyx_t_11); __Pyx_INCREF(__pyx_t_13); __pyx_t_11++; if (unlikely(0 < 0)) __PYX_ERR(0, 735, __pyx_L1_error)
          #else
          __pyx_t_13 = PySequence_ITEM(__pyx_t_4, __pyx_t_11); __pyx_t_11++; if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 735, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_13);
          #endif
        } else {
          if (__pyx_t_11 >= PyTuple_GET_SIZE(__pyx_t_4)) break;
          #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
          __pyx_t_13 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_11); __Pyx_INCREF(__pyx_t_13); __pyx_t_11++; if (unlikely(0 < 0)) __PYX_ERR(0, 735, __pyx_L1_error)
          #else
          __pyx_t_13 = PySequence_ITEM(__pyx_t_4, __pyx_t_11); __pyx_t_11++; if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 735, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_13);
          #endif
        }
      } else {
        __pyx_t_13 = __pyx_t_12(__pyx_t_4);
        if (unlikely(!__pyx_t_13)) {
          PyObject* exc_type = PyErr_Occurred();
          if (exc_type) {
            if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
            else __PYX_ERR(0, 735, __pyx_L1_error)
          }
          break;
        }
        __Pyx_GOTREF(__pyx_t_13);
      }
      __Pyx_XDECREF_SET(__pyx_v_lvl, __pyx_t_13);
      __pyx_t_13 = 0;
      __pyx_v_i = __pyx_t_6;
      __pyx_t_6 = (__pyx_t_6 + 1);
/* … */
      __pyx_L5_continue:;
    }
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;

This is pretty expensive. Unfortunately, the fix for it is to do something like this:

for i in range(levels.shape[0]):
    lvl = levels[i]

A similar thing will have to be done for the second operation.

I'm happy to make these changes, but want to give @cphyc the opportunity first.

@Xarthisius
Copy link
Member Author

@cphyc I went ahead and included a fix in #2735 I wanted to see if that fixes all the remaining test/doc build failures. I'd appreciate you taking a look, TIA!

@neutrinoceros
Copy link
Member

I looked over the code that may cause it, and I think the issue is in oct_container.pyx, specifically lines like these:

for i, lvl in enumerate(levels):

and

for i, (lev, dom) in enumerate(zip(levels, domains)):

oops, those two were my suggestions. Learning everyday ! Thanks for figuring this out you guys !

@cphyc
Copy link
Member

cphyc commented Jul 20, 2020

@cphyc I went ahead and included a fix in #2735 I wanted to see if that fixes all the remaining test/doc build failures. I'd appreciate you taking a look, TIA!

I had a look and this already looks good to me. We may gain an extra bit by using memory views instead of numpy arrays, see the patch below.

diff --git a/yt/geometry/oct_container.pyx b/yt/geometry/oct_container.pyx
index db2e13490..9c6ab9867 100644
--- a/yt/geometry/oct_container.pyx
+++ b/yt/geometry/oct_container.pyx
@@ -720,24 +720,34 @@ cdef class OctreeContainer:
     @cython.wraparound(False)
     @cython.cdivision(True)
     def fill_level(self, int level,
-                   np.ndarray[np.uint8_t, ndim=1] levels,
-                   np.ndarray[np.uint8_t, ndim=1] cell_inds,
-                   np.ndarray[np.int64_t, ndim=1] file_inds,
+                   np.uint8_t[::1] levels,
+                   np.uint8_t[::1] cell_inds,
+                   np.int64_t[::1] file_inds,
                    dest_fields, source_fields,
                    np.int64_t offset = 0):
         cdef np.ndarray[np.float64_t, ndim=2] source
         cdef np.ndarray[np.float64_t, ndim=1] dest
-        cdef int i
+        cdef np.float64_t[::1] dest_view
+        cdef np.float64_t[::1, :] source_view
+
+        cdef int i, lvl, Nlevel, file_ind
 
+        Nlevel = len(levels)
         for key in dest_fields:
             dest = dest_fields[key]
             source = source_fields[key]
-            for i, lvl in enumerate(levels):
+
+            dest_view = dest
+            source_view = source
+            for i in range(Nlevel):
+                lvl = levels[i]
                 if lvl != level: continue
-                if file_inds[i] < 0:
-                    dest[i + offset] = np.nan
+
+                file_ind = file_inds[i]
+                if file_ind < 0:
+                    dest_view[i + offset] = np.nan
                 else:
-                    dest[i + offset] = source[file_inds[i], cell_inds[i]]
+                    dest_view[i + offset] = source_view[file_ind, cell_inds[i]]
 
     def fill_index(self, SelectorObject selector = AlwaysSelector(None)):
         """Get the on-file index of each cell"""
@@ -802,10 +812,10 @@ cdef class OctreeContainer:
     @cython.cdivision(True)
     def fill_level_with_domain(
                    self, int level,
-                   np.uint8_t[:] levels,
-                   np.uint8_t[:] cell_inds,
-                   np.int64_t[:] file_inds,
-                   np.int32_t[:] domains,
+                   np.uint8_t[::1] levels,
+                   np.uint8_t[::1] cell_inds,
+                   np.int64_t[::1] file_inds,
+                   np.int32_t[::1] domains,
                    dict dest_fields,
                    dict source_fields,
                    np.int32_t domain,
@@ -819,19 +829,29 @@ cdef class OctreeContainer:
         """
         cdef np.ndarray[np.float64_t, ndim=2] source
         cdef np.ndarray[np.float64_t, ndim=1] dest
-        cdef int i, count
+        cdef np.float64_t[::1] dest_view
+        cdef np.float64_t[::1, :] source_view
+
+        cdef int i, count, lvl, dom, Nlevel, file_ind
+
+        Nlevel = len(levels)
 
         for key in dest_fields:
             dest = dest_fields[key]
+            dest_view = dest
             source = source_fields[key]
+            source_view = source
             count = 0
-            for i, (lev, dom) in enumerate(zip(levels, domains)):
+            for i in range(Nlevel):
+                lev = levels[i]
+                dom = domains[i]
                 if lev != level or dom != domain: continue
                 count += 1
-                if file_inds[i] < 0:
-                    dest[i + offset] = np.nan
+                file_ind = file_inds[i]
+                if file_ind < 0:
+                    dest_view[i + offset] = np.nan
                 else:
-                    dest[i + offset] = source[file_inds[i], cell_inds[i]]
+                    dest_view[i + offset] = source_view[file_ind, cell_inds[i]]
         return count
 
     @cython.boundscheck(False)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

4 participants