IBM Quantum Challenge – Jan's write-up

Contact: j.tulowiecki@beit.tech

This notebook is a shortened version of original write-up, later called the PDF. The Jupyter version shows only the practical aspect of work done during IBM Quantum Challenge Fall 2020 competition. It shows the implementation of subsequent optimizations to obtain the score of 6,574. The original document contains more insight into the Final Problem and also proves most of the important parts of the algorithm and the solution.

The problem is described in the PDF as well as in other participants' excellent write-ups.

Let's start by defining two problem sets and creating a function to validate and evaluate solutions.

Now let's implement the baseline solution, i.e. using multicontrolled gates provided by Qiskit.

Other features of this baseline solution include:

qRAM I/O using Gray Codes (counterintuitively, this implementation is more straighforward than if we had to decompose index into bitmask and negated controlling bits)
Fractional phase change in the oracle

def solution_1(problem_set):
    idx = QuantumRegister(4, name='idx')
    board = QuantumRegister(16, name='brd')
    
    result = ClassicalRegister(4, name='res')
    qc = QuantumCircuit(idx, board, result)
    
    qc.h(idx)
    
    # QRAM load using Gray codes
    gray_bit = [3, 2, 3, 1, 3, 2, 3, 0, 3, 2, 3, 1, 3, 2, 3, 0]
    gray = [15, 14, 12, 13, 9, 8, 10, 11, 3, 2, 0, 1, 5, 4, 6, 7]

    for (bit, code) in zip(gray_bit, gray):
        for stars in problem_set[code]:
            (r, c) = tuple(map(int, stars))
            rc = 4 * r + c
            qc.mcx(idx, board[rc])
        qc.x(idx[bit])
    
    
    # ORACLE, i.e. introducing phase equal to (2pi / 3) x perm(M)
    for p in permutations(range(4)):
        squares = [board[4 * i + p[i]] for i in range(4)]
        qc.mcu1(2 * pi / 3, squares[:-1], squares[-1])
    
    
    # QRAM
    for (bit, code) in zip(gray_bit, gray):
        for stars in problem_set[code]:
            (r, c) = tuple(map(int, stars))
            rc = 4 * r + c
            qc.mcx(idx, board[rc])
        qc.x(idx[bit])
        
    # GROVER DIFFUSER
    qc.h(idx)
    qc.x(idx)
    qc.mcu1(pi, idx[:-1], idx[-1])
    qc.x(idx)
    qc.h(idx)
    
    # READ RESULT
    for (q, c) in zip(idx, result):
        qc.measure(q, c)
    return qc

evaluate(solution_1)

Running Test 1
Running Test 2
Solution correct! Cost: 99081

As we can see, this short code obtains the result of 99,081 which is not that bad but still far from being in the final top 10 solutions.

Now, let's introduce ancilla to enhance qRAM loading and algorithm. Instead of having large, multi-controlled gates we use ancillae the way described in the PDF.

To obtain this solution, we enriched solution_1 with ancilla enhanced multi-controlled gates decomposition and smart ancilla management.

def solution_2(problem_set):
    idx = QuantumRegister(4, name='idx')
    board = QuantumRegister(16, name='brd')
    ancilla = QuantumRegister(6, name='anc')
    
    result = ClassicalRegister(4, name='res')
    qc = QuantumCircuit(idx, board, ancilla, result)
    
    qc.h(idx)
    
    # QRAM
    gray_bit = [1, 0, 1, 0]
    gray = [3, 2, 0, 1]

    # We start by pairing first two bits and compute conjunctions
    # of all their mask combinations into separate ancillae.
    
    for (i, g) in enumerate(gray):
        qc.ccx(idx[0], idx[1], ancilla[i])
        qc.x(idx[gray_bit[i]])
    
    # Now, for the last pair, we iterate through all possible masks
    # and merge them with ancillae. We use two additional ancillae,
    # so that each star is placed with a single CX gate.
    
    for (i, g) in enumerate(gray):
        # pair the last two bits of the idx into ancilla[4].
        qc.ccx(idx[2], idx[3], ancilla[4])
        for x in range(4):
            # pair idx prefix and suffix ancillae into one,
            # to enhance copying of the 6 stars.
            qc.ccx(ancilla[x], ancilla[4], ancilla[5])
            inst = problem_set[4 * gray[x] + g]
            for stars in inst:
                (r, c) = tuple(map(int, stars))
                rc = 4 * r + c
                qc.cx(ancilla[5], board[rc])
            # Uncompute the temporary ancilla
            qc.ccx(ancilla[x], ancilla[4], ancilla[5])
        
        # uncompute ancilla[4]
        qc.ccx(idx[2], idx[3], ancilla[4])
        qc.x(idx[2 + gray_bit[i]])
    
    # uncompute ancillae 0 - 3.
    for (i, g) in enumerate(gray):
        qc.ccx(idx[0], idx[1], ancilla[i])
        qc.x(idx[gray_bit[i]])
    
    
    # ORACLE
    
    # selection of the first "box", i.e. selection of the two columns
    combi = list(combinations(range(4), 2))
    # the remaining box
    combi_compl = [tuple(x for x in range(4) if x not in l) for l in combi]
    
    for (a, b), (c, d) in zip(combi, combi_compl):
        
        # compute boxes diagonals into separate ancillae
        qc.ccx(board[a], board[4 + b], ancilla[0])
        qc.ccx(board[b], board[4 + a], ancilla[1])
        qc.ccx(board[8 + c], board[12 + d], ancilla[2])
        qc.ccx(board[8 + d], board[12 + c], ancilla[3])
    
        for x in [0, 1]:
            for y in [2, 3]:
                # execute phase change when both diagonals are in state |1>.
                qc.cu1(2 * pi / 3, ancilla[x], ancilla[y])
        
        # uncompute ancillae 0 - 3
        qc.ccx(board[a], board[4 + b], ancilla[0])
        qc.ccx(board[b], board[4 + a], ancilla[1])
        qc.ccx(board[8 + c], board[12 + d], ancilla[2])
        qc.ccx(board[8 + d], board[12 + c], ancilla[3])
    
    
    # QRAM again.
    for (i, g) in enumerate(gray):
        qc.ccx(idx[0], idx[1], ancilla[i])
        qc.x(idx[gray_bit[i]])
    for (i, g) in enumerate(gray):
        qc.ccx(idx[2], idx[3], ancilla[4])
        for x in range(4):
            qc.ccx(ancilla[x], ancilla[4], ancilla[5])
            inst = problem_set[4 * gray[x] + g]
            for stars in inst:
                (r, c) = tuple(map(int, stars))
                rc = 4 * r + c
                qc.cx(ancilla[5], board[rc])
            qc.ccx(ancilla[x], ancilla[4], ancilla[5])
        qc.ccx(idx[2], idx[3], ancilla[4])
        qc.x(idx[2 + gray_bit[i]])
    for (i, g) in enumerate(gray):
        qc.ccx(idx[0], idx[1], ancilla[i])
        qc.x(idx[gray_bit[i]])
    
    # GROVER DIFFUSER
    for i in range(4):
        qc.u3(-pi/2, pi, -pi, idx[i])
    qc.ccx(idx[0], idx[1], ancilla[0])
    qc.mcu1(pi, [idx[2], idx[3]], ancilla[0])
    qc.ccx(idx[0], idx[1], ancilla[0])
    for i in range(4):
        qc.u3(pi/2, -pi, pi, idx[i])
    
    # READ RESULT
    for (q, c) in zip(idx, result):
        qc.measure(q, c)
    return qc

evaluate(solution_2)

Running Test 1
Running Test 2
Solution correct! Cost: 12671

Wow! This result of 12,671 is much much better!

Now let's get below 10k points with simple substitution of Margolus gate in place of CCX marking ancilla. Places where this optimization applies are marked with exclamation mark.

This simple Margolus gate trick is the main optimization that makes solution_2 and solution_3 different. Also, the Diffusion operator is slightly reduced.

def margolus(qc, qb):
    qc.ry(pi/4, qb[2])
    qc.cx(qb[1], qb[2])
    qc.ry(pi/4, qb[2])
    qc.cx(qb[0], qb[2])
    qc.ry(-pi/4, qb[2])
    qc.cx(qb[1], qb[2])
    qc.ry(-pi/4, qb[2])

def solution_3(problem_set):
    idx = QuantumRegister(4, name='idx')
    board = QuantumRegister(16, name='brd')
    ancilla = QuantumRegister(6, name='anc')
    
    result = ClassicalRegister(4, name='res')
    qc = QuantumCircuit(idx, board, ancilla, result)
    
    qc.h(idx)
    
    # QRAM
    gray_bit = [1, 0, 1, 0]
    gray = [3, 2, 0, 1]

    for (i, g) in enumerate(gray):
        margolus(qc, [idx[0], idx[1], ancilla[i]]) # !
        qc.x(idx[gray_bit[i]])
    for (i, g) in enumerate(gray):
        margolus(qc, [idx[2], idx[3], ancilla[4]]) # !
        for x in range(4):
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]]) # !
            inst = problem_set[4 * gray[x] + g]
            for stars in inst:
                (r, c) = tuple(map(int, stars))
                rc = 4 * r + c
                qc.cx(ancilla[5], board[rc])
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]]) # !
        margolus(qc, [idx[2], idx[3], ancilla[4]]) # !
        qc.x(idx[2 + gray_bit[i]])
    for (i, g) in enumerate(gray):
        margolus(qc, [idx[0], idx[1], ancilla[i]]) # !
        qc.x(idx[gray_bit[i]])
    
    
    # ORACLE
    
    combi = list(combinations(range(4), 2))
    combi_compl = [tuple(x for x in range(4) if x not in l) for l in combi]
    
    for (a, b), (c, d) in zip(combi, combi_compl):
        margolus(qc, [board[a], board[4 + b], ancilla[0]]) # !
        margolus(qc, [board[b], board[4 + a], ancilla[1]]) # !
        margolus(qc, [board[8 + c], board[12 + d], ancilla[2]]) # !
        margolus(qc, [board[8 + d], board[12 + c], ancilla[3]]) # !
    
        for x in [0, 1]:
            for y in [2, 3]:
                qc.cu1(2 * pi / 3, ancilla[x], ancilla[y])
        
        margolus(qc, [board[a], board[4 + b], ancilla[0]]) # !
        margolus(qc, [board[b], board[4 + a], ancilla[1]]) # !
        margolus(qc, [board[8 + c], board[12 + d], ancilla[2]]) # !
        margolus(qc, [board[8 + d], board[12 + c], ancilla[3]]) # !
    
    
    # QRAM again.
    for (i, g) in enumerate(gray):
        margolus(qc, [idx[0], idx[1], ancilla[i]]) # !
        qc.x(idx[gray_bit[i]])
    for (i, g) in enumerate(gray):
        margolus(qc, [idx[2], idx[3], ancilla[4]]) # !
        for x in range(4):
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]]) # !
            inst = problem_set[4 * gray[x] + g]
            for stars in inst:
                (r, c) = tuple(map(int, stars))
                rc = 4 * r + c
                qc.cx(ancilla[5], board[rc])
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]]) # !
        margolus(qc, [idx[2], idx[3], ancilla[4]]) # !
        qc.x(idx[2 + gray_bit[i]])
    for (i, g) in enumerate(gray):
        margolus(qc, [idx[0], idx[1], ancilla[i]]) # !
        qc.x(idx[gray_bit[i]])
    
    # GROVER DIFFUSER
    for i in range(4):
        qc.u3(-pi/2, pi, -pi, idx[i])
    margolus(qc, [idx[0], idx[1], ancilla[0]]) # !
    qc.mcu1(pi, [idx[2], idx[3]], ancilla[0])
    # we simplify the Diffusion operator by skipping this last CCX
    # as in *the PDF* obtaining around 19%  success rate
    # qc.ccx(idx[0], idx[1], ancilla[0])
    for i in range(4):
        qc.u3(pi/2, -pi, pi, idx[i])
    
    # READ RESULT
    for (q, c) in zip(idx, result):
        qc.measure(q, c)
    return qc

evaluate(solution_3)

Running Test 1
Running Test 2
Solution correct! Cost: 7527

The obtained result of 7,527 should suffice to make it into the top 10. But we're not done yet.

The final touch to solution_3 is the introduction of the inter- and intra- routine (mainly Margolus) optimizations, as in the PDF. To do that, we need to implement all the gates ourselves and simplify as many gates as possible.

# "Partial" Margolus for convenience. We add parameters that
# helps us to execute only subsequence of Margolus gates.
def margolus(qc, qb, fr=0, to=6):
    if fr <= 0 <= to:
        qc.ry(pi/4, qb[2])
    if fr <= 1 <= to:
        qc.cx(qb[1], qb[2])
    if fr <= 2 <= to:
        qc.ry(pi/4, qb[2])
    if fr <= 3 <= to:
        qc.cx(qb[0], qb[2])
    if fr <= 4 <= to:
        qc.ry(-pi/4, qb[2])
    if fr <= 5 <= to:
        qc.cx(qb[1], qb[2])
    if fr <= 6 <= to:
        qc.ry(-pi/4, qb[2])

# The cheapest CCZ gate I could find on the internet.
# It's used only in the Diffusion Operator.
def ccz(qc, qb):
    qc.cx(qb[1], qb[2])
    qc.rz(-pi/4, qb[2])
    qc.cx(qb[0], qb[2])
    qc.rz(pi/4, qb[2])
    qc.cx(qb[1], qb[2])
    qc.rz(-pi/4, qb[2])
    qc.cx(qb[0], qb[2])
    qc.rz(pi/4, qb[1])
    qc.rz(pi/4, qb[2])
    qc.cx(qb[0], qb[1])
    qc.rz(pi/4, qb[0])
    qc.rz(-pi/4, qb[1])
    qc.cx(qb[0], qb[1])

# The actual Controlled Phase Shift gate consists of 2 more U3 gates.
# We only include the decomposition's core and manipulate those two
# not included U3 gates in the outer scope to enable reductions.
def CPh(qc, angle, qb):
    qc.cx(qb[0], qb[1])
    qc.u3(0, 0, -angle / 2, qb[1])
    qc.cx(qb[0], qb[1])

# The central CCCZ of the Diffusion Operator abstracted to a single function.
def ancilla_toffoli_z(qc, qb):
    margolus(qc, [qb[x] for x in [0, 1, 4]], fr=1)
    ccz(qc, [qb[x] for x in [2, 3, 4]])
#    margolus(qc, [qb[x] for x in [0, 1, 4]])

def solution_4(problem_set):
    idx = QuantumRegister(4, name='idx')
    board = QuantumRegister(16, name='brd')
    ancilla = QuantumRegister(6, name='anc')
    
    result = ClassicalRegister(4, name='res')
    qc = QuantumCircuit(idx, board, ancilla, result)
    
    qc.h(idx)
    
    # QRAM
    gray_bit = [1, 0, 1, 0]
    gray = [3, 2, 0, 1]

    for (i, g) in enumerate(gray):
        margolus(qc, idx[:2] + [ancilla[i]])
        qc.x(idx[gray_bit[i]])
    for (i, g) in enumerate(gray):
        # complicated rules describing possible optimizations of subsequent
        # Margolus gates with the same target. Those matching Margolus gates are
        # the last Margolus targeting given ancilla from the previous iteration
        # and the first margolus targeting the same ancilla in this iteration.
        outer_margolus_from = 0 if i == 0 else (3 if i == 2 else 1)
        outer_margolus_to = 3 if i == 3 else (3 if i == 1 else 5)
        margolus(qc, idx[2:] + [ancilla[4]], fr=outer_margolus_from)
        for x in range(4):
            # The same simplifications as above.
            inner_margolus_from = (0 if i == 0 else 1) if x == 0 else 3
            inner_margolus_to = 5 if x == 3 else 3
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]], fr=inner_margolus_from)
            inst = problem_set[4 * gray[x] + g]
            for stars in inst:
                (r, c) = tuple(map(int, stars))
                rc = 4 * r + c
                qc.cx(ancilla[5], board[rc])
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]], to=inner_margolus_to)
        margolus(qc, idx[2:] + [ancilla[4]], to=outer_margolus_to)
        qc.x(idx[2 + gray_bit[i]])
    for (i, g) in enumerate(gray):
        # We don't execute the last Ry for it can
        # interroutine optimize with the Oracle 
        margolus(qc, idx[:2] + [ancilla[i]],to=5)
        qc.x(idx[gray_bit[i]])
    
    
    # ORACLE
    combi = list(combinations(range(4), 2))
    combi_compl = [tuple(x for x in range(4) if x not in l) for l in combi]
    combi_len = len(combi)
    for combination_id in range(combi_len):
        (a, b) = combi[combination_id]
        (c, d) = combi_compl[combination_id]
        # Those rules are the same as above. It seems like there are more
        # combinations but it's simply just the result of diligent analysis
        # of previous and current iteration. The first and last combination
        # is optimizing the Ry with preceding / following QRAM Margoluses
        # (thence check for xx == 0 and xx == combi_len - 1).
        if combination_id == 0:
            f0, f1, f2, f3 = 1, 1, 1, 1
        else:
            comb = combi[combination_id - 1]
            comb_compl = combi_compl[combination_id - 1]
            f0 = 3 if comb[1] == b else 1
            f1 = 3 if comb[0] == a else 1
            f2 = 3 if comb_compl[1] == d else 1
            f3 = 3 if comb_compl[0] == c else 1
        if combination_id == combi_len - 1:
            t0, t1, t2, t3 = 5, 5, 5, 5
        else:
            comb = combi[combination_id + 1]
            comb_compl = combi_compl[combination_id + 1]
            t0 = 3 if comb[1] == b else 5
            t1 = 3 if comb[0] == a else 5
            t2 = 3 if comb_compl[1] == d else 5
            t3 = 3 if comb_compl[0] == c else 5
        
        # You can notice how fi and ti correspond to each other -
        # fi "checks" whether in the previous iteration second control
        # qubit is the same and ti - whether the next iteration begins
        # with the same control qubit.
        margolus(qc, [board[a], board[4 + b], ancilla[0]], fr=f0,to=5)
        margolus(qc, [board[b], board[4 + a], ancilla[1]], fr=f1,to=5)
        margolus(qc, [board[8 + c], board[12 + d], ancilla[2]], fr=f2)
        margolus(qc, [board[8 + d], board[12 + c], ancilla[3]], fr=f3)
    
        # The controlled-phase gate loop gets unrolled.
        # Some of the controlled phase gates are interfering with preceding
        #   Margolus gates.
        qc.u3(pi/4,-2*pi/3,pi, ancilla[0])
        CPh(qc, 2 * pi / 3, [ancilla[0], ancilla[2]])
        qc.u3(0, 0, pi / 3, ancilla[2])
        
        qc.u3(0, 0, pi / 3, ancilla[0])
        CPh(qc, 2 * pi / 3, [ancilla[0], ancilla[3]])
        qc.u3(0, 0, pi / 3, ancilla[3])
        
        qc.u3(pi/4,-2*pi/3,pi, ancilla[1])
        CPh(qc, 2 * pi / 3, [ancilla[1], ancilla[2]])
        qc.u3(pi/4,0,pi/3, ancilla[2])
        
        qc.u3(0, 0, pi / 3, ancilla[1])
        CPh(qc, 2 * pi / 3, [ancilla[1], ancilla[3]])
        qc.u3(pi/4,0,pi/3, ancilla[3])
            
        margolus(qc, [board[a], board[4 + b], ancilla[0]], to=t0)
        margolus(qc, [board[b], board[4 + a], ancilla[1]], to=t1)
        margolus(qc, [board[8 + c], board[12 + d], ancilla[2]], to=t2,fr=1)
        margolus(qc, [board[8 + d], board[12 + c], ancilla[3]], to=t3,fr=1)
    
    
    # QRAM
    for (i, g) in enumerate(gray):
        margolus(qc, idx[:2] + [ancilla[i]],fr=1)
        qc.x(idx[gray_bit[i]])
    for (i, g) in enumerate(gray):
        outer_margolus_from = 3 if i == 0 else (3 if i == 2 else 1)
        outer_margolus_to = 5 if i == 3 else (3 if i == 1 else 5)
        margolus(qc, idx[2:] + [ancilla[4]], fr=outer_margolus_from)
        for x in range(4):
            inner_margolus_from = 1 if x == 0 else 3
            inner_margolus_to = 5 if x == 3 else 3
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]], fr=inner_margolus_from)
            inst = problem_set[4 * gray[x] + g]
            for stars in inst:
                (r, c) = tuple(map(int, stars))
                rc = 4 * r + c
                qc.cx(ancilla[5], board[rc])
            margolus(qc, [ancilla[x], ancilla[4], ancilla[5]], to=inner_margolus_to)
        margolus(qc, idx[2:] + [ancilla[4]], to=outer_margolus_to)
        qc.x(idx[2 + gray_bit[i]])
    for (i, g) in enumerate(gray):
        margolus_to = 5 if i == 0 else 6
        margolus(qc, idx[:2] + [ancilla[i]], to=margolus_to)
        if i < 3:
            qc.x(idx[gray_bit[i]])
    
    # GROVER DIFFUSER
    
    # Single inter-routine optimization with preceding QRAM.
    qc.u3(pi/2, pi,2*pi, idx[0])
    for i in range(1, 4):
        qc.u3(-pi/2, pi, -pi, idx[i])
    ancilla_toffoli_z(qc, idx[:] + [ancilla[0]])
    for i in range(4):
        qc.u3(pi/2, -pi, pi, idx[i])
    
    # READ RESULT
    for (q, c) in zip(idx, result):
        qc.measure(q, c)
    return qc

evaluate(solution_4)

Running Test 1
Running Test 2
Solution correct! Cost: 6574

And we're done! The result of 6,574 is an amazing result, granting us the 6th place in the final scoreboard.