RAC interconnect traffic

RAC interconnect traffic



http://gjilevski.com/2011/08/02/oracle-cache-fusion-private-inter-connects-and-practical-performance-management-considerations-in-oracle-rac/

http://www.datadisk.co.uk/html_docs/rac/performance.htm



 @/tmp/latchprofx.sql name,ksllwnam,ksllwlbl % % 100000


DEF _lhp_what="&1"

DEF _lhp_sid="&2"
DEF _lhp_name="&3"
DEF _lhp_samples="&4"

COL name FOR A35 TRUNCATE

COL latchprof_total_ms HEAD "Held ms" FOR 999999.999
COL latchprof_pct_total_samples head "Held %" format 999.99
COL latchprof_avg_ms HEAD "Avg hold ms" FOR 999999.999
COL dist_samples HEAD Gets
COL total_samples HEAD Held
COL ksllwnam FOR A40 TRUNCATE
COL ksllwlbl FOR A20 TRUNCATE
COL objtype  FOR A20 TRUNCATE
COL object   FOR A17 WRAP JUST RIGHT
COL hmode    FOR A12 TRUNCATE
COL what     FOR A17 WRAP
COL func     FOR A40 TRUNCATE

BREAK ON lhp_name SKIP 1


DEF _IF_ORA_10_OR_HIGHER="--"


PROMPT

PROMPT -- LatchProfX 2.02

COL latchprof_oraversion NEW_VALUE _IF_ORA_10_OR_HIGHER


SET TERMOUT OFF

SELECT
    DECODE(SUBSTR(BANNER, INSTR(BANNER, 'Release ')+8,1), 1, '', '--') latchprof_oraversion
FROM v$version WHERE ROWNUM=1;
SET TERMOUT ON

WITH

    t1 AS (SELECT hsecs FROM v$timer),
    samples AS (
        SELECT /*+ ORDERED USE_NL(l.x$ksuprlat) USE_NL(s.x$ksuse) NO_TRANSFORM_DISTINCT_AGG */
            &_lhp_what
            &_IF_ORA_10_OR_HIGHER , COUNT(DISTINCT gets)        dist_samples
          , COUNT(*)                    total_samples
          , COUNT(*) / &_lhp_samples    total_samples_pct
        FROM
            (SELECT /*+ NO_MERGE */ 1 FROM DUAL CONNECT BY LEVEL <= &_lhp_samples) s,
            (SELECT ksuprpid PID, ksuprsid SID, ksuprlnm NAME, ksuprlat LADDR, ksulawhr,
                    TO_CHAR(ksulawhy,'XXXXXXXXXXXXXXXX') object
                    &_IF_ORA_10_OR_HIGHER , ksulagts GETS
                    &_IF_ORA_10_OR_HIGHER , lower(ksuprlmd) HMODE
             FROM x$ksuprlat) l,
            (SELECT
                    indx
                  , ksusesqh     sqlhash
                  , ksusesql     sqladdr
                  &_IF_ORA_10_OR_HIGHER , ksusesph planhash
                  &_IF_ORA_10_OR_HIGHER , ksusesch sqlchild
                  &_IF_ORA_10_OR_HIGHER , ksusesqi sqlid
             FROM x$ksuse) s,
            (SELECT indx,
                    ksllwnam func, ksllwnam,
                    ksllwlbl objtype, ksllwlbl
             FROM x$ksllw) w
        WHERE
            l.sid LIKE '&_lhp_sid'
        AND l.ksulawhr = w.indx (+)
        AND l.sid = s.indx
        AND (LOWER(l.name) LIKE LOWER('%&_lhp_name%') OR LOWER(RAWTOHEX(l.laddr)) LIKE LOWER('%&_lhp_name%'))
        GROUP BY
            &_lhp_what
        ORDER BY
            total_samples DESC
    ),
    t2 AS (SELECT hsecs FROM v$timer)
SELECT /*+ ORDERED */
    &_lhp_what
  , s.total_samples
  &_IF_ORA_10_OR_HIGHER , s.dist_samples
--  , s.total_samples_pct
  , s.total_samples / &_lhp_samples * 100 latchprof_pct_total_samples
  , (t2.hsecs - t1.hsecs) * 10 * s.total_samples / &_lhp_samples latchprof_total_ms
--  , s.dist_events
    &_IF_ORA_10_OR_HIGHER , (t2.hsecs - t1.hsecs) * 10 * s.total_samples / dist_samples / &_lhp_samples latchprof_avg_ms
  FROM
    t1,
    samples s,
    t2
  WHERE ROWNUM <= 20
/

COL name    CLEAR

COL hmode   CLEAR
COL what    CLEAR
COL func    CLEAR

COL objtype CLEAR


NAME                                KSLLWNAM                                 KSLLWLBL                   Held       Gets  Held %     Held ms Avg hold ms
----------------------------------- ---------------------------------------- -------------------- ---------- ---------- ------- ----------- -----------
ges resource hash list              kjrmas1: lookup master node                                        86203      36070   86.20   29179.716        .809
ges resource hash list              kjlrlr: remove lock from resource queue                            15166       2751   15.17    5133.691       1.866
ges process parent latch            kjata_fg                                                           10625      10591   10.63    3596.563        .340
cache buffers chains                kcbgcur_2                                                           7195       7035    7.20    2435.508        .346
redo copy                           kcrfw_redo_gen: nowait                   latch                      5798       5746    5.80    1962.623        .342
cache buffers chains                kcbgtcr: fast path                                                  4994       4976    4.99    1690.469        .340
ges process parent latch            kjlrem: detach lock from group                                      4802       4733    4.80    1625.477        .343
ges resource hash list              kjrref: find matched resource                                       4657       4175    4.66    1576.395        .378
enqueue hash chains                 ksqcmi: if lk mode requested                                        4561       4559    4.56    1543.899        .339
ges process parent latch            kjatioc                                                             4359       4353    4.36    1475.522        .339
ges process parent latch            kjlalc: lock allocation from lmon                                   4329       2565    4.33    1465.367        .571
enqueue hash chains                 ksqgtl3                                  acquiring session          3476       3422    3.48    1176.626        .344
ges resource hash list              kjakoca: search for resp by resname                                 3337       3038    3.34    1129.575        .372
enqueue hash chains                 ksqrcl                                   resource addr              3119       3115    3.12    1055.782        .339
cache buffers chains                kcbrls_2                                                            2002       1929    2.00     677.677        .351
ges enqueue table freelist          kjlalc: lock allocation                                             1567        584    1.57     530.430        .908
cache buffers chains                kcbzwb                                                              1537       1459    1.54     520.275        .357
ges process parent latch            kjucll: closing lock                                                1386       1386    1.39     469.161        .339
flashback copy                      krfgdata1                                                           1074        369    1.07     363.549        .985
cache buffers chains                kcbget: fast exchange                                               1058       1049    1.06     358.133        .341
20 rows selected.



Below views provide the current hardware configuration:
SQL> SET lines 200 pages 100
SQL> SELECT * FROM gv$cluster_interconnects ORDER BY inst_id,name;
 
   INST_ID NAME            IP_ADDRESS       IS_ SOURCE
---------- --------------- ---------------- --- -------------------------------
         1 eth0:1          169.254.5.247    NO
         1 eth1:1          169.254.192.128  NO
         2 eth0:1          169.254.90.252   NO
         2 eth1:1          169.254.158.153  NO
SQL> SET lines 200 pages 100
SQL> SELECT * FROM gv$configured_interconnects ORDER BY inst_id,name;
 
   INST_ID NAME            IP_ADDRESS       IS_ SOURCE
---------- --------------- ---------------- --- -------------------------------
         1 bond0           10.75.36.4       YES
         1 bond0:1         10.75.36.48      YES
         1 bond0:2         10.75.36.33      YES
         1 eth0:1          169.254.5.247    NO
         1 eth1:1          169.254.192.128  NO
         2 bond0           10.75.36.7       YES
         2 bond0:1         10.75.36.47      YES
         2 bond0:2         10.75.36.49      YES
         2 bond0:5         10.75.36.42      YES
         2 eth0:1          169.254.90.252   NO
         2 eth1:1          169.254.158.153  NO
Then there are multiple views providing the amount of blocks (data, undo, …) exchanged between cluster instances:
SQL> SET lines 200 pages 100
SQL> SELECT inst_id,class,cr_block,current_block
FROM gv$instance_cache_transfer WHERE instance IN (1,2)
ORDER BY inst_id,class;
 
   INST_ID CLASS                CR_BLOCK CURRENT_BLOCK
---------- ------------------ ---------- -------------
         1 1st LEVEL bmb            1566          3016
         1 1st LEVEL bmb               0             0
         1 2nd LEVEL bmb               0             0
         1 2nd LEVEL bmb            2241          1652
         1 3rd LEVEL bmb               0             0
         1 3rd LEVEL bmb              15            18
         1 bitmap block                0             0
         1 bitmap block                0             0
         1 bitmap INDEX block          0           347
         1 bitmap INDEX block          0             0
         1 data block             105067        392338
         1 data block                  0             0
         1 extent map                  0             0
         1 extent map                  0             0
         1 FILE header block         347           771
         1 FILE header block           0             0
         1 free list                   0             0
         1 free list                   8             8
         1 save undo block             0             0
         1 save undo block             0             0
         1 save undo header            0             0
         1 save undo header            0             0
         1 SEGMENT header              0             0
         1 SEGMENT header            358          1687
         1 sort block                  0             0
         1 sort block                  0             0
         1 undo block                  0             0
         1 undo block                 67             1
         1 undo header                 0             0
         1 undo header             89024          5956
         1 unused                      0             0
         1 unused                      0             0
         2 1st LEVEL bmb            9368         25428
         2 1st LEVEL bmb               0             0
         2 2nd LEVEL bmb           12819         10844
         2 2nd LEVEL bmb               0             0
         2 3rd LEVEL bmb               0             0
         2 3rd LEVEL bmb             657           651
         2 bitmap block                0             0
         2 bitmap block                0             0
         2 bitmap INDEX block          0             0
         2 bitmap INDEX block          0          1821
         2 data block                  0             0
         2 data block             612191       1999885
         2 extent map                  0             0
         2 extent map                  0             0
         2 FILE header block        2114          3657
         2 FILE header block           0             0
         2 free list                   0             0
         2 free list                1459          1541
         2 save undo block             0             0
         2 save undo block             0             0
         2 save undo header            0             0
         2 save undo header            0             0
         2 SEGMENT header              0             0
         2 SEGMENT header           1654          6654
         2 sort block                  0             0
         2 sort block                  0             0
         2 undo block                  0             0
         2 undo block               5897             0
         2 undo header            272117         12986
         2 undo header                 0             0
         2 unused                      0             0
         2 unused                      0             0
SQL> SET lines 200 pages 100
SQL> SELECT inst_id,cr_requests,current_requests
FROM gv$cr_block_server;
 
   INST_ID CR_REQUESTS CURRENT_REQUESTS
---------- ----------- ----------------
         1      105862            93810
         2      870616          8415406
If you are wondering the difference between CR and current blocks it is clearly explained in Oracle documentation:
The Global Cache Block Access Latency chart shows data for two different types of data block requests: current and consistent-read (CR) blocks. When you update data in the database, Oracle Database must locate the most recent version of the data block that contains the data, which is called the current block. If you perform a query, then only data committed before the query began is visible to the query. Data blocks that were changed after the start of the query are reconstructed from data in the undo segments, and the reconstructed data is made available to the query in the form of a consistent-read block.
As those table contains figures since the instance started it is not easily usable to get the interconnect traffic which is an amount of information over a period of time. I was also tempted to sum GC CR Block Received Per Second and GC Current Block Received Per Second from V$SYSMETRIC_HISTORY table.
But as explained around on Internet this would not take into account the size of messages exchanged by cluster instances. The formula can be finally find insprepins.sql file located in $ORACLE_HOME/rdbms/admin directory. This file is script to generate Statspack reports.
Estd Interconnect traffic = ((Global Cache blocks received + Global Cache blocks served)*db_block_size +(GCS/GES messages received + GCS/GES messages sent)*200)/elapsed time
This can be verified in an AWR report…
The bad news is that figures are available in GV$SYSSTAT and GV$GES_STATISTICS (or GV$DLM_MISC if you have not executed$ORACLE_HOME/rdbms/admin/catclust.sql script). Means that we will have to subtract figures to compute the interconnect throughput.
Remark:
Even if Oracle takes 200 bytes as GCS/GES message size there could be a formula to compute it but nothing confirmed officially:
SQL>  SELECT SUM(kjxmsize * (kjxmrcv + kjxmsnt + kjxmqsnt)) / SUM((kjxmrcv + kjxmsnt + kjxmqsnt)) "avg Message size"
      FROM x$kjxm
      WHERE kjxmrcv > 0 OR kjxmsnt > 0 OR kjxmqsnt > 0;
 
AVG Message SIZE
----------------
      312.751415
Last but not least the above formula does not take into account the MTU of network interface:
[root@server1 ~]# ifconfig eth0
eth0      Link encap:Ethernet  HWaddr B4:99:BA:A7:07:2A
          inet addr:10.10.10.11  Bcast:10.10.10.255  Mask:255.255.255.0
          inet6 addr: fe80::b699:baff:fea7:72a/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:475791490 errors:0 dropped:0 overruns:0 frame:0
          TX packets:172347326 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000
          RX bytes:147344045136 (137.2 GiB)  TX bytes:86219877687 (80.2 GiB)
Conclusion could be that crosschecking figures at OS level (RX / TX) versus the ones of Oracle is anyway interesting…

Script


So all is turning around this query:
SQL> SELECT
DECODE(name,'gc cr blocks received','global cache blocks received','gc cr blocks served','global cache blocks served','gc current blocks received','global cache blocks received','gc current blocks served','global cache blocks served',name) AS name,
  SUM(VALUE) AS VALUE,
  TO_CHAR(SYSDATE,'yyyy-mm-dd hh24:mi:ss') AS date_taken
  FROM gv$sysstat
  WHERE inst_id=1
  AND name IN ('gc cr blocks received','gc cr blocks served','gc current blocks received','gc current blocks served','gcs messages sent','ges messages sent')
  GROUP BY DECODE(name,'gc cr blocks received','global cache blocks received','gc cr blocks served','global cache blocks served','gc current blocks received','global cache blocks received','gc current blocks served','global cache blocks served',name)
  UNION
  SELECT name,VALUE,TO_CHAR(SYSDATE,'yyyy-mm-dd hh24:mi:ss') AS date_taken
  FROM gv$dlm_misc
  WHERE name IN ('gcs msgs received','ges msgs received')
  AND inst_id=1;
 
NAME                                                                  VALUE DATE_TAKEN
---------------------------------------------------------------- ---------- -------------------
gcs messages sent                                                   5357944 2012-05-16 17:12:34
gcs msgs received                                                   1807244 2012-05-16 17:12:34
ges messages sent                                                   1428546 2012-05-16 17:12:34
ges msgs received                                                   1453824 2012-05-16 17:12:34
global cache blocks received                                         626606 2012-05-16 17:12:34
global cache blocks served                                           508232 2012-05-16 17:12:34
 
6 ROWS selected.
To compute interconnect transfer rate you have to execute the above query 2 times and divide the result by the elapsed time. In a more advanced script it gives:
If you want to recompute what you find in an AWR report you can use DBA_HIST_DLM_MISC and DBA_HIST_SYSSTAT hsitory tables.

Performance queries


Global Cache Service processes (LMS) statistics:
SQL> SET lines 200 pages 100
SQL> SELECT * FROM gv$current_block_server;
 
   INST_ID       PIN1      PIN10     PIN100    PIN1000   PIN10000     FLUSH1
---------- ---------- ---------- ---------- ---------- ---------- ----------
   FLUSH10   FLUSH100  FLUSH1000 FLUSH10000     WRITE1    WRITE10   WRITE100
---------- ---------- ---------- ---------- ---------- ---------- ----------
 WRITE1000 WRITE10000    CLEANDC      RCVDC    QUEUEDC    EVICTDC    WRITEDC
---------- ---------- ---------- ---------- ---------- ---------- ----------
         1         25        255        355          0          0       4255
      2163         60          4          0      36586      10143        478
        42         39          0          0          0      23948          0
 
         2        119        393        471          0          0      18724
      6353         61          0          0     163483      58352       2277
       284        623          0          0          0      65906          0
Latency of interconnect messages:
SQL> SET lines 200 pages 100
SQL> SELECT * FROM dba_hist_interconnect_pings WHERE snap_id=2846;
 
   SNAP_ID       DBID INSTANCE_NUMBER TARGET_INSTANCE   CNT_500B  WAIT_500B WAITSQ_500B     CNT_8K    WAIT_8K  WAITSQ_8K
---------- ---------- --------------- --------------- ---------- ---------- ----------- ---------- ---------- ----------
      2846 2446434119               1               1      49270   11135446     2673946      49270   10535445    2393620
      2846 2446434119               1               2      49270   12011547     4889338      49270   15731115    7441966
      2846 2446434119               2               1      49270   11611649     4012287     206735   62938564   29425924
      2846 2446434119               2               2     206896   55943066    15317805     206896   52958767   13732654
Interesting one coming from racdiag.sql script:
SQL> SET lines 200
SQL> SELECT b1.inst_id, b2.VALUE "GCS CR BLOCKS RECEIVED",
     b1.VALUE "GCS CR BLOCK RECEIVE TIME",
     ((b1.VALUE / b2.VALUE) * 10) "AVG CR BLOCK RECEIVE TIME (ms)"
     FROM gv$sysstat b1, gv$sysstat b2
     WHERE b1.name = 'global cache cr block receive time'
     AND b2.name = 'global cache cr blocks received'
     AND b1.inst_id = b2.inst_id OR b1.name = 'gc cr block receive time'
     AND b2.name = 'gc cr blocks received'
     AND b1.inst_id = b2.inst_id;
 
   INST_ID GCS CR BLOCKS RECEIVED GCS CR BLOCK RECEIVE TIME AVG CR BLOCK RECEIVE TIME (ms)
---------- ---------------------- ------------------------- ------------------------------
         1                 207686                      5048                     .243059234
         2                1089552                     43020                     .394841182

Comments